# EDA
Now that the data is cleaned, what's in it?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [2]:
# Execute the project's text-processing script so its definitions are available in this kernel.
# NOTE(review): STOP_WORDS, used further down, is presumably defined by this script — confirm.
%run ../scripts/post_scraping_text_processing.py functions

Text processing functions loaded.


## Reading in data
This notebook reads the output of the script `train_test_split.py`, which divides the data into training and test sets. The split turns out to be unnecessary at this stage, so the two sets are concatenated back into a single dataframe below.

In [3]:
# Read both halves of the split, then stack them row-wise into one frame for exploration.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../assets/data/train.csv', '../assets/data/test.csv')
)
df = pd.concat([train, test])

In [4]:
# Shapes in order: train, test, combined.
for frame in (train, test, df):
    print(frame.shape)

(11538, 6)
(3847, 6)
(15385, 6)


In [5]:
# List the column labels of the combined frame.
df.columns

Index(['artist', 'album', 'text', 'name', 'year', 'written_before'], dtype='object')

## Looking at the two classes separately
In this section:
- split the dataframe into two based on target class
- define a function that uses `CountVectorizer` to produce a word list with frequencies and proportions

In [6]:
# Partition the rows on the target flag.
# NOTE(review): written_before == 1 presumably marks lyrics written before the coup — confirm.
written_before = df['written_before']
precoup = df[written_before == 1]
postcoup = df[written_before == 0]

In [7]:
# Independent copies of the lyric text for each class.
precoup_corpus, postcoup_corpus = precoup['text'].copy(), postcoup['text'].copy()

In [8]:
# Vectorizer for the pre-coup lyrics: drop stop words and ignore terms
# that occur in fewer than two documents.
cvec_pre = CountVectorizer(
    min_df=2,
    stop_words=STOP_WORDS,
)

In [9]:
# Learn the vocabulary from the pre-coup lyrics.
cvec_pre.fit(precoup_corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None,
        stop_words={'başkası', 'nasılsa', 'evvelce', 'mü', 'az', 'şura', 'çoğun', 'birisine', 'onları', 'ancak', 'kendilerine', 'hiçbirinde', 'dayanarak', 'çoğunda', 'binaen', 'birileri', 'ettiğini', 'şuracıkta', 'birisinin', 'şeyler', 'sizin', 'onda', 'yalnız', 'esasen', 'kendi', 'vasıtasıyla', 'nerdeyse',...mlemizden', 'daima', 'öbürüne', 'nere', 'şundan', 'nice', 'kimisinde', 'hiçbirine', 'bunda', 'gine'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
# Encode the pre-coup lyrics as a document-term count matrix using the fitted vocabulary.
transformed = cvec_pre.transform(precoup_corpus)

In [11]:
# Total each term's occurrences directly on the sparse matrix; calling
# .toarray() first would allocate a dense (documents x vocabulary) array.
pre_term_totals = np.asarray(transformed.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
pre_terms = (
    cvec_pre.get_feature_names_out()
    if hasattr(cvec_pre, 'get_feature_names_out')
    else cvec_pre.get_feature_names()
)
# Twenty most frequent terms, as a one-column frame indexed by term.
pd.Series(pre_term_totals, index=pre_terms).sort_values(ascending=False).to_frame('count').head(20)

Unnamed: 0,count
bir,6569
gel,1459
yar,1212
aman,989
ah,892
gün,870
aşk,716
oy,643
sensiz,565
güzel,545


In [12]:
def make_vocab_list(corpus, stopwords = None, ngram_range = (1,1), min_df = 1):
    """Count how often each term occurs across a corpus.

    Fits a ``CountVectorizer`` on ``corpus`` and totals every term's
    occurrences over all documents.

    Parameters
    ----------
    corpus : iterable of str
        The documents to vectorize.
    stopwords : list-like or None, default None
        Forwarded to ``CountVectorizer(stop_words=...)``.
    ngram_range : tuple, default (1, 1)
        Forwarded to ``CountVectorizer(ngram_range=...)``.
    min_df : int or float, default 1
        Minimum document frequency, forwarded to ``CountVectorizer(min_df=...)``.

    Returns
    -------
    pandas.DataFrame
        A single ``'count'`` column indexed by term, sorted by count in
        descending order.
    """
    cv = CountVectorizer(stop_words=stopwords, ngram_range=ngram_range, min_df=min_df)
    doc_term = cv.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older installations.
    if hasattr(cv, 'get_feature_names_out'):
        terms = cv.get_feature_names_out()
    else:
        terms = cv.get_feature_names()

    # Sum columns on the sparse matrix itself. Converting to a dense array
    # first would need documents x vocabulary cells of memory.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()

    return pd.Series(totals, index=terms).sort_values(ascending=False).to_frame('count')
    
    

In [13]:
# Vocabulary tables for each class using the function's default options.
words_precoup, words_postcoup = (
    make_vocab_list(corpus) for corpus in (precoup_corpus, postcoup_corpus)
)

In [14]:
# Sum of all term counts in each class; used below as the denominator for proportions.
total_precoup, total_postcoup = (
    words['count'].sum() for words in (words_precoup, words_postcoup)
)
print(total_precoup)
print(total_postcoup)

280331
749536


In [15]:
# Express each term's count as a share of its class total so the two
# classes can be compared on the same scale.
for words, total in ((words_precoup, total_precoup), (words_postcoup, total_postcoup)):
    words['proportion'] = words['count'] / total

In [16]:
# Most frequent pre-coup terms with their counts and proportions.
words_precoup.head(20)

Unnamed: 0,count,proportion
bir,6569,0.023433
bu,3485,0.012432
ne,2935,0.01047
ben,2826,0.010081
beni,2648,0.009446
sen,2409,0.008593
seni,2096,0.007477
bana,1641,0.005854
gibi,1560,0.005565
de,1552,0.005536


In [17]:
# Most frequent post-coup terms with their counts and proportions.
words_postcoup.head(20)

Unnamed: 0,count,proportion
bir,16505,0.02202
bu,9736,0.012989
ben,8011,0.010688
ne,6864,0.009158
sen,6638,0.008856
beni,6412,0.008555
seni,4894,0.006529
de,4163,0.005554
bana,4132,0.005513
gibi,4036,0.005385


In [18]:
# Align the two vocabularies on the term index. An inner join keeps only
# terms that appear in both classes.
word_table = words_precoup.merge(
    words_postcoup,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_pre', '_post'),
)

In [19]:
# Preview the side-by-side pre/post counts and proportions.
word_table.head()

Unnamed: 0,count_pre,proportion_pre,count_post,proportion_post
bir,6569,0.023433,16505,0.02202
bu,3485,0.012432,9736,0.012989
ne,2935,0.01047,6864,0.009158
ben,2826,0.010081,8011,0.010688
beni,2648,0.009446,6412,0.008555


In [20]:
# Number of distinct lyric texts, to compare against the total row count.
unique_texts = df['text'].unique()
print(len(unique_texts))

15175


In [21]:
# Total rows and columns, for comparison with the unique-text count.
df.shape

(15385, 6)

***Even though I scrubbed and scrubbed, there are still duplicates (same lyrics, different singers): 15,385 rows but only 15,175 unique texts. This is probably inevitable.***

In [22]:
# Count how many rows share each distinct lyric text; value_counts lists
# the most frequent first, so repeated lyrics appear at the top.
df['text'].value_counts()

Başın öne eğilmesin Aldırma gönül aldırma Ağladığın duyulmasın Aldırma gönül aldırma Dışarıda deli dalgalar Gelip duvarları yalar Seni bu sesler oyalar Aldırma gönül aldırma Kurşun ata ata biter Yollar gide gide biter Mapus yata yata biter Aldırma gönül aldırma Dertlerin kalkınca şaha Bir sitem yolla Allaha Görecek günler var daha Aldırma gönül aldırma                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      