In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random

from TurkishStemmer import TurkishStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Data exploration, first pass: more cleaning functions
This section creates a column with embedded ads removed and whitespace collapsed to single spaces (`fixed_text`), and a column with Turkish stopwords removed and the remaining words stemmed with `TurkishStemmer` (`processed_text`). These functions have also been added to the cleaning script.

In [None]:
# Load the scraped lyrics CSV, using the file's first column as the row index.
df = pd.read_csv('assets/lyrics/lyrics_scraped_20180621-134937.csv', index_col = 0)

In [None]:
# (rows, columns) of the frame that was just loaded.
df.shape

In [None]:
# Preview the first few rows to check the columns parsed as expected.
df.head()

In [None]:
def text_process(text):
    '''
    Clean a single text string: tokenize, drop stopwords, stem, re-join.

    Steps:
        1. Tokenize on runs of word characters, which discards punctuation.
        2. Lower-case every token and drop NLTK's Turkish stopwords.
        3. Stem the surviving tokens with TurkishStemmer.
        4. Remove the first stray 'b' token, if there is one.

    Parameters:
        text: the raw string to clean; None/NaN is treated as empty.

    Returns:
        A single string of cleaned tokens separated by one space. Null
        input returns "" so every result has the same type.
    '''
    if pd.isnull(text):
        # Same type as the normal path, so a mapped column stays all-string.
        return ""

    # Tokenizing on word characters removes punctuation as a side effect.
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Build the stopword set once per call (not once per token) and use a
    # set so each membership test is O(1).
    turkish_stopwords = set(stopwords.words('turkish'))
    lowered = (token.lower() for token in tokens)
    kept = [token for token in lowered if token not in turkish_stopwords]

    stemmer = TurkishStemmer()
    stems = [stemmer.stem(token) for token in kept]

    # Only the first 'b' is dropped (list.remove semantics).
    # NOTE(review): presumably a scraping artifact -- confirm why 'b' appears.
    if 'b' in stems:
        stems.remove('b')

    return " ".join(stems)  # space-joined so the words stay distinct

In [None]:
# Grab the raw text of the row labelled 34 as a sample for the cleaner.
test_text = df.loc[34, 'text']

In [None]:
# Spot-check the cleaning function on the single sample string.
text_process(test_text)

In [None]:
# Apply the tokenize / stopword / stem pipeline to every row's raw text.
df['processed_text'] = df['text'].map(text_process)

In [None]:
# Number of single-space-delimited pieces in the raw text. `.str.len()` gives
# NaN for a missing text, where calling len() on a float NaN would raise.
df['num_words'] = df['text'].str.split(" ").str.len()

# Only a cell's last expression is displayed, so the inspection goes last.
df[['processed_text', 'num_words']].head()

In [None]:
# Word count for the row labelled 1, read with a single scalar lookup
# (no intermediate row Series).
df.loc[1, 'num_words']

In [None]:
# Rank rows from most to fewest words to surface unusually long texts.
df.sort_values(by='num_words', ascending=False)

In [None]:
# Raw text of the row labelled 1246, used as a scratch sample in the next cells.
test = df.loc[1246, 'text']

In [None]:
# Show the unmodified sample text.
test

In [None]:
# Matches from the literal "eval" through the LAST "]" on the same line:
# `.*` is greedy and `.` does not cross newlines.
pattern = r'(eval.*])'

In [None]:
# Delete the span matched by `pattern`, then collapse every run of whitespace
# (including newlines) to a single space, and display the result.
test = re.sub(pattern, "", test)
test = re.sub(r"\s+", " ", test)
test

In [None]:
def remove_ads(text):
    '''
    Delete embedded ad snippets from a text string.

    A snippet is anything from the literal "eval" through the last "]" on
    the same line (the match is greedy and does not cross newlines).

    Parameters:
        text: the string to clean; None/NaN is passed through untouched.

    Returns:
        The string with every matching span removed, or the original null.
    '''
    if pd.isnull(text):
        # re.sub would raise TypeError on a missing value; keep it missing.
        return text
    pattern = r'(eval.*])'
    return re.sub(pattern, "", text)

def regularize_whitespace(text):
    '''
    Collapse each run of whitespace (spaces, tabs, newlines) to one space.

    Leading and trailing whitespace is reduced to a single space, not
    stripped.

    Parameters:
        text: the string to normalize; None/NaN is passed through untouched.

    Returns:
        The normalized string, or the original null.
    '''
    if pd.isnull(text):
        return text
    whitepattern = r'\s+'
    return re.sub(whitepattern, " ", text)

def remove_and_reg(text):
    '''
    Remove ad snippets, then normalize whitespace.

    Ads are removed first, while the original line breaks are still in
    place, because the ad pattern is line-bound.

    Parameters:
        text: the string to clean; None/NaN is passed through untouched.

    Returns:
        The cleaned string, or the original null.
    '''
    text = remove_ads(text)
    text = regularize_whitespace(text)
    return text

In [None]:
# Ad-stripped, whitespace-normalized copy of every row's raw text.
df['fixed_text'] = df['text'].map(remove_and_reg)

In [None]:
# Inspect the cleaned column.
df['fixed_text']

In [None]:
# Rows whose title contains 'Dost' anywhere, including at position 0 (which a
# `str.find(...) > 0` test would miss). Missing titles count as non-matches.
df[df['title'].str.contains('Dost', regex=False, na=False)]

In [None]:
# Cleaned text of row 1287 with leading/trailing whitespace trimmed for display.
df.loc[1287, 'fixed_text'].strip()

In [None]:
# Raw text of the same row (1287), to compare against the cleaned version.
df.loc[1287, 'text']

In [None]:
# Raw texts containing the literal " les"; missing texts count as non-matches
# so the boolean mask never carries NaN.
# NOTE(review): presumably a probe for French-language lyrics -- confirm.
df.loc[df['text'].str.contains(" les", regex=False, na=False), 'text']

In [None]:
# Cleaned text of row 449, trimmed for display.
df.loc[449, 'fixed_text'].strip()

In [None]:
# Raw text of the same row (449), trimmed, to compare against the cleaned version.
df.loc[449, 'text'].strip()

In [None]:
# Rows by the artist 'Candan Erçetin' whose album name contains "Aranjman".
df[(df['artist'] == 'Candan Erçetin') & df['album'].str.contains("Aranjman")]

In [None]:
# Raw text of row 1313 with leading/trailing whitespace trimmed.
df.loc[1313,'text'].strip()

In [None]:
# Title of the same row (1313).
df.loc[1313, 'title']

In [None]:
# Print the cleaned text for index labels 1310 through 1320 (label slices are
# inclusive at both ends), each followed by a separator line.
for text in df.loc[1310:1320, 'fixed_text']:
    print(text, '_-----_', sep='\n')

In [None]:
# Keep only ASCII letters and spaces from row 1313's raw text; everything
# else (digits, punctuation, newlines, non-ASCII letters) is deleted.
re.sub(r'[^A-Za-z ]', "", df.loc[1313,'text'])

In [None]:
# Distinct lower-cased, single-space-delimited tokens in row 1313's raw text.
set(df.loc[1313,'text'].strip().lower().split(" "))

# Playing with word2vec

In [None]:
from gensim.models import word2vec

In [None]:
# Reload the scraped CSV from disk. This rebinds `df`, so columns added to the
# earlier frame (processed_text, num_words, fixed_text) are not present here.
f = "assets/lyrics/lyrics_scraped_20180621-134937.csv"
df = pd.read_csv(f, index_col=0)

In [None]:
# (rows, columns) of the reloaded frame.
df.shape

In [None]:
# Number of rows per artist, most frequent first.
df['artist'].value_counts()

In [None]:
# Hyperparameters passed to word2vec.Word2Vec in the training cell below.
num_features = 400    # Word vector dimensionality (passed as `size`)
min_word_count = 15   # Words whose total count is below this are left out of the vocabulary
num_workers = 4       # Number of worker threads used for training
context = 4          # Context window size (passed as `window`)
downsampling = 1e-3   # Downsampling threshold for frequent words (passed as `sample`)

In [None]:
# Ad-stripped, whitespace-normalized text for every row (one string per row).
text = df['text'].map(remove_and_reg)

In [None]:
# Lower-case each text and split it into a token list. A bare split() breaks
# on any whitespace run and never yields '' tokens, so a leading or trailing
# space cannot add an empty "word" to the Word2Vec vocabulary.
text = [i.lower().split() for i in text]

In [None]:
# Train the embedding on the tokenized texts. `size` is the gensim 3.x keyword
# for vector dimensionality (gensim 4 renamed it to `vector_size`).
model = word2vec.Word2Vec(
    text,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

In [None]:
# Every word that survived the min-count cut. `wv.vocab` is the gensim 3.x
# mapping (gensim 4 exposes `wv.key_to_index` instead).
voc = list(model.wv.vocab)

In [None]:
# Show the learned vocabulary.
voc

In [None]:
# The 30 words the model ranks as most similar to 'çocuklar'.
model.wv.most_similar('çocuklar', topn=30)