# Table of Contents

### I. Loading and Preprocessing Data
### II. Creating Text Representations
> ##### 1. Bag of Words
> ##### 2. TF-IDF

# I. Loading and Preprocessing Data

In [1]:
# Mount Google Drive at /content/drive (Colab-only) so files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Import library
import numpy as np
import pandas as pd

In [3]:
# Read the tweets CSV from the mounted Google Drive folder into a DataFrame
df = pd.read_csv(r'/content/drive/My Drive/tweets.csv')

In [4]:
# Preview the first five rows (all columns) of the raw dataset
df.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0.0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331.0,True,False
1,RT @Hemant_80: Did you vote on #Demonetization...,False,0.0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66.0,True,False
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0.0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12.0,True,False
3,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0.0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338.0,True,False
4,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0.0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120.0,True,False


In [5]:
# Only keep the text column. It is selected by name, so the cell does not
# depend on the column order of the CSV and fails loudly if 'text' is missing.
# .copy() yields an independent frame that later cells can safely add columns to.
df = df[['text']].copy()

In [6]:
# Preview the dataset again: only the text column should remain
df.head()

Unnamed: 0,text
0,RT @rssurjewala: Critical question: Was PayTM ...
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...


# II. Creating Text Representations

## Bag of Words

In [7]:
# Import BoW function from sklearn
from sklearn.feature_extraction.text import CountVectorizer

Documentation: [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [8]:
# Create a bag-of-words vectorizer with scikit-learn's default settings
word_bow = CountVectorizer()

In [9]:
# Learn the vocabulary from the raw tweet texts (fit only; the count matrix is built in the next step)
word_bow.fit(df['text'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
# Encode every tweet as a row of token counts over the fitted vocabulary
word_vectors_bow = word_bow.transform(df['text'].values)

In [11]:
# Features: the learned vocabulary, in the column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns a NumPy array.
word_bow.get_feature_names_out()

['00',
 '000',
 '00716',
 '0080',
 '0081',
 '0082',
 '0083',
 '0084',
 '0085',
 '0086',
 '0087',
 '0088',
 '0089',
 '008a',
 '008b',
 '008c',
 '008d',
 '008e',
 '008f',
 '0090',
 '0091',
 '0092',
 '0094',
 '0095',
 '0097',
 '0098',
 '0099',
 '009a',
 '009b',
 '009c',
 '009d',
 '00a0',
 '00a1',
 '00a2',
 '00a3',
 '00a4',
 '00a5',
 '00a7',
 '00a8',
 '00a9',
 '00aa',
 '00ab',
 '00ad',
 '00ae',
 '00af',
 '00b0',
 '00b1',
 '00b2',
 '00b3',
 '00b4',
 '00b6',
 '00b7',
 '00b8',
 '00b9',
 '00bb',
 '00bc',
 '00bd',
 '00be',
 '00bf',
 '01',
 '01zwcmfzca',
 '0276_gaubert',
 '02rmaokyui',
 '03',
 '04',
 '046ba2opmi',
 '05',
 '086f5vo1ce',
 '0902',
 '0905',
 '0906',
 '090f',
 '0915',
 '0917',
 '091a',
 '091c',
 '091f',
 '0920',
 '0921',
 '0922',
 '0923',
 '0924',
 '0925',
 '0926',
 '0928',
 '092a',
 '092b',
 '092c',
 '092d',
 '092e',
 '092f',
 '0930',
 '0932',
 '0933',
 '0935',
 '0936',
 '0937',
 '0938',
 '0939',
 '093c',
 '093e',
 '093f',
 '0940',
 '0941',
 '0942',
 '0943',
 '0947',
 '0948',
 '094b

In [12]:
# Display the sparse matrix summary (shape, dtype, number of stored entries)
# Rows are the 5157 documents; columns are the 13541 distinct tokens in the vocabulary
word_vectors_bow

<5157x13541 sparse matrix of type '<class 'numpy.int64'>'
	with 86437 stored elements in Compressed Sparse Row format>

In [13]:
# Document representation: one row per tweet, one column per vocabulary term.
# The frame is sparse-backed, so the 5157 x 13541 matrix is never densified
# (.toarray() would allocate every zero as an 8-byte integer).
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
vocab = word_bow.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(word_vectors_bow, columns=vocab)

Unnamed: 0,00,000,00716,0080,0081,0082,0083,0084,0085,0086,0087,0088,0089,008a,008b,008c,008d,008e,008f,0090,0091,0092,0094,0095,0097,0098,0099,009a,009b,009c,009d,00a0,00a1,00a2,00a3,00a4,00a5,00a7,00a8,00a9,...,znnpdq4dpi,znwtkish3e,zobb9jutev,zoejmyc4ce,zomatoin,zone,zpaxklxkbm,zphcgmsmk1,zpsanfcvxr,zqblc100i9,zql2rbfpvw,zr74erdntc,zrch1emheh,zrputefedf,zsavptnvnc,zsbojzle9p,zscvpust86,zskczjcce2,zt4p0fwf2y,ztznefjaz3,zucosqdqib,zut2n5mojv,zuxxkunx7o,zv3ixylhci,zve7myt04g,zvsucykeky,zvup54iah5,zw9stpgpr6,zwpql5frwn,zxhhmuwceq,zxiusza2s7,zxuecwobqp,zyitjkbklc,zylu2al27f,zymrlzofxm,zyuakjdi4h,zz0mflmpfd,zzh5moxrtq,zzthdwqbfy,zzyjzzuhlu
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Reducing sparsity

### 1. Preprocessing document text

In [14]:
# Import relevant libraries
import spacy
import re

# Load the small English model. Only token lemmas are read from the resulting
# docs in clean(), so the dependency parser and the named-entity recogniser
# are switched off to avoid running them on every tweet.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [15]:
# Preprocessing function
def clean(text):
    """Strip non-letter characters from ``text`` and lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) is treated as an empty document.

    Returns
    -------
    str
        The lemma of every token produced by the module-level spaCy pipeline
        ``nlp``, joined by single spaces. No explicit lowercasing is applied
        here.
    """
    # A missing value would make the regex raise TypeError; map it to an
    # empty document so the column stays all-string for the vectorizers.
    if not isinstance(text, str):
        return ""

    # Replace every run of characters that are NOT ASCII letters (digits,
    # punctuation, '@', '#', '_', non-ASCII text, ...) with a single space
    text = re.sub(r'[^a-zA-Z]+', ' ', text)

    # Tokenize with spaCy and keep only each token's lemma
    return " ".join(token.lemma_ for token in nlp(text))

In [16]:
# Run clean() on every tweet (one spaCy call per row) and store the result in a new column
df['text_clean'] = df['text'].apply(clean)

In [17]:
# Compare raw and cleaned text side by side for the first ten rows
df.head(10)

Unnamed: 0,text,text_clean
0,RT @rssurjewala: Critical question: Was PayTM ...,RT rssurjewala Critical question be PayTM info...
1,RT @Hemant_80: Did you vote on #Demonetization...,RT Hemant do -PRON- vote on demonetization on ...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",RT roshankar former FinSec rbi Dy Governor CBD...
3,RT @ANI_news: Gurugram (Haryana): Post office ...,RT ANI news Gurugram Haryana Post office emplo...
4,RT @satishacharya: Reddy Wedding! @mail_today ...,RT satishacharya Reddy Wedding mail today cart...
5,@DerekScissors1: Indias #demonetization: #Bla...,DerekScissors India s demonetization Blackmo...
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,RT gauravcsawant Rs lakh loot from a bank in K...
7,RT @Joydeep_911: Calling all Nationalists to j...,RT Joydeep call all Nationalists to join Walk ...
8,RT @sumitbhati2002: Many opposition leaders ar...,RT sumitbhati many opposition leader be with n...
9,National reform now destroyed even the essence...,national reform now destroy even the essence o...


In [18]:
# Same configuration as CountVectorizer(); two of the defaults are spelled out for reference
word_bow = CountVectorizer(binary=False,  # Count the occurrences of the terms
                           lowercase=True,  # Lowercase the text before tokenizing
                           )

In [19]:
# Learn the vocabulary from the cleaned text and build the count matrix in one step
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [20]:
# Display the sparse matrix summary (shape, dtype, number of stored entries)
word_vectors_bow

<5157x12794 sparse matrix of type '<class 'numpy.int64'>'
	with 85915 stored elements in Compressed Sparse Row format>

In [21]:
# Document representation: one row per tweet, one column per vocabulary term.
# The frame is sparse-backed, so the full matrix is never densified in memory.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
vocab = word_bow.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(word_vectors_bow, columns=vocab)

Unnamed: 0,aa,aaadhar,aaanupriyaaa,aadhaar,aadhar,aadhe,aadityagautom,aadmi,aagr,aaj,aajtak,aakashhindocha,aakroshdin,aam,aamaaadmiparty,aamaadami,aamaadmi,aamaadmiparty,aamadmy,aamir,aamirkhan,aap,aapakhi,aapgujarat,aapharsh,aapians,aapkadepositgaya,aapkarajiv,aapko,aaplogical,aapnewsalert,aappoojapandey,aaptard,aapved,aapvind,aaq,aaqzvhgi,aartic,aartitikoo,aashabisht,...,ztznefjaz,zu,zucosqdqib,zue,zut,zuxxkunx,zv,zvd,zve,zvh,zvqiqo,zvr,zvsucykeky,zvt,zvup,zvvbjg,zw,zwcmfzca,zwdk,zwj,zwpql,zwsoa,zxhhmuwceq,zxiusza,zxuecwobqp,zy,zyf,zyitjkbklc,zylashz,zylu,zymrlzofxm,zynql,zyuakjdi,zz,zzdxhds,zzh,zzl,zzthdwqbfy,zzygw,zzyjzzuhlu
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### 2. Keep only the most frequent terms

In [22]:
# Update arguments: cap the vocabulary size
word_bow = CountVectorizer(binary=False,  # Count the occurrences of the terms
                           lowercase=True,  # Lowercase
                           max_features=5000,  # Keep only the 5000 most frequent terms across the corpus
                           )

In [23]:
# Re-learn the vocabulary (now capped by max_features) and rebuild the count matrix
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [24]:
# Display the sparse matrix summary; the column count should equal max_features
word_vectors_bow

<5157x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 77970 stored elements in Compressed Sparse Row format>

In [25]:
# Document representation: one row per tweet, one column per retained term.
# The frame is sparse-backed, so the full matrix is never densified in memory.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
vocab = word_bow.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(word_vectors_bow, columns=vocab)

Unnamed: 0,aa,aadhaar,aadhar,aadhe,aadmi,aajtak,aam,aamaadmi,aamaadmiparty,aamir,aamirkhan,aap,aapharsh,aapkadepositgaya,aaplogical,aaptard,aartic,aartitikoo,aata,ab,abby,abdul,abdulsaleemgad,abhi,abhic,abhijit,abhimay,abhishek,abhisingh,ability,abk,able,about,above,abpnewshindi,absolute,absolutely,abt,abuse,ac,...,yv,yvm,yw,zakka,zc,zcfh,zd,zeebusiness,zeenews,zeenewshindi,zeenewssports,zennie,zero,zerohedge,zetkrqqclx,zf,zh,zig,zijqntudpm,zindabad,zjp,zk,zkwjcb,zl,zlfw,zm,znvyojsu,zo,zone,zr,zt,zu,zv,zvup,zvvbjg,zwcmfzca,zxhhmuwceq,zyitjkbklc,zylu,zz
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### 3. Thresholding the occurrence of terms

In [26]:
# Update arguments: prune the vocabulary by document frequency
# (integer thresholds count documents containing the term, not total occurrences)
word_bow = CountVectorizer(binary=False,  # Count the occurrences of the terms
                           lowercase=True,  # Lowercase
                           max_df=500,  # Drop terms that appear in more than 500 documents
                           min_df=10,  # Drop terms that appear in fewer than 10 documents
                           )

In [27]:
# Re-learn the vocabulary under the min_df/max_df thresholds and rebuild the count matrix
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [28]:
# Display the sparse matrix summary (shape, dtype, number of stored entries)
word_vectors_bow

<5157x953 sparse matrix of type '<class 'numpy.int64'>'
	with 39380 stored elements in Compressed Sparse Row format>

In [29]:
# Document representation: one row per tweet, one column per retained term.
# The frame is sparse-backed, so the full matrix is never densified in memory.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
vocab = word_bow.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(word_vectors_bow, columns=vocab)

Unnamed: 0,aa,aadhaar,aadmi,aam,aamaadmiparty,aap,able,about,abt,accept,account,achieve,acjlive,across,act,action,actually,ad,add,address,adsense,adsensegate,advertiser,affect,after,again,against,agenda,agree,ahead,ahmedabad,all,allow,almost,already,also,always,amazing,amid,amit,...,what,when,where,whether,which,while,white,who,whole,whose,why,will,win,with,withdraw,withdrawal,withdrawupto,without,woman,wonder,word,work,world,worry,worth,would,wow,write,wrong,yeah,year,yes,yet,yogi,young,youtube,youtuber,yrdeshmukh,yt,zone
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### 4. N-gram BoW

In [30]:
# Update arguments: build the vocabulary from word pairs instead of single words
word_bow = CountVectorizer(binary=False,  # Count the occurrences of the terms
                           lowercase=True,  # Lowercase
                           ngram_range=(2,2)  # Bigrams only (minimum n = maximum n = 2)
                           )

In [31]:
# Learn the bigram vocabulary from the cleaned text and build the count matrix
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [32]:
# Display the sparse matrix summary (shape, dtype, number of stored entries)
word_vectors_bow

<5157x41883 sparse matrix of type '<class 'numpy.int64'>'
	with 85476 stored elements in Compressed Sparse Row format>

In [33]:
# Features: the learned bigram vocabulary, in the column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns a NumPy array.
word_bow.get_feature_names_out()

['aa gaye',
 'aa https',
 'aa lfy',
 'aa lvsy',
 'aa mazc',
 'aa pje',
 'aa rahe',
 'aa to',
 'aa yogi',
 'aaadhar expansion',
 'aaanupriyaaa dear',
 'aadhaar amp',
 'aadhaar and',
 'aadhaar be',
 'aadhaar biometric',
 'aadhaar demonetization',
 'aadhaar evm',
 'aadhaar first',
 'aadhaar https',
 'aadhaar line',
 'aadhaar live',
 'aadhaar platform',
 'aadhaar slave',
 'aadhaar this',
 'aadhaar will',
 'aadhar and',
 'aadhar could',
 'aadhar demonetization',
 'aadhar necessary',
 'aadhar now',
 'aadhe kehte',
 'aadityagautom assam',
 'aadmi aap',
 'aadmi be',
 'aadmi demonetization',
 'aadmi jandhan',
 'aadmi listen',
 'aadmi punish',
 'aadmi support',
 'aagr mb',
 'aaj kiske',
 'aajtak demonetization',
 'aajtak pron',
 'aajtak zeenewshindi',
 'aakashhindocha blog',
 'aakroshdin demonetization',
 'aam aadmi',
 'aamaaadmiparty mamata',
 'aamaadami at',
 'aamaadmi be',
 'aamaadmi shri',
 'aamaadmiparty also',
 'aamaadmiparty arvindkejriwal',
 'aamaadmiparty catch',
 'aamaadmiparty demonst

In [34]:
# Document representation: one row per tweet, one column per bigram.
# The frame is sparse-backed: densifying 5157 x 41883 values with .toarray()
# would need roughly 1.7 GB for 8-byte integers.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
vocab = word_bow.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(word_vectors_bow, columns=vocab)

Unnamed: 0,aa gaye,aa https,aa lfy,aa lvsy,aa mazc,aa pje,aa rahe,aa to,aa yogi,aaadhar expansion,aaanupriyaaa dear,aadhaar amp,aadhaar and,aadhaar be,aadhaar biometric,aadhaar demonetization,aadhaar evm,aadhaar first,aadhaar https,aadhaar line,aadhaar live,aadhaar platform,aadhaar slave,aadhaar this,aadhaar will,aadhar and,aadhar could,aadhar demonetization,aadhar necessary,aadhar now,aadhe kehte,aadityagautom assam,aadmi aap,aadmi be,aadmi demonetization,aadmi jandhan,aadmi listen,aadmi punish,aadmi support,aagr mb,...,zone sagarikaghose,zone shashitharoor,zone shekhargupta,zone shekharkapur,zone srbachchan,zone understand,zone virsanghvi,zpjwgfkk podcast,zqblc pron,zql rbfpvw,zr erdntc,zrch emheh,zrmemz htt,zrxkqg https,zrxl gyw,zszn https,zt fvxw,zt fwf,zt nznruf,ztxsq jl,zu xrq,zut mojv,zv ixylhci,zvd via,zve myt,zvh youtube,zvr kj,zvsucykeky https,zvup iah,zw stpgpr,zwpql frwn,zwsoa google,zyitjkbklc via,zylu al,zymrlzofxm https,zz al,zz mflmpfd,zzh moxrtq,zzl offa,zzygw em
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## TF-IDF

In [35]:
# Import tfidf vectorizer function from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

Documentation: [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [36]:
# Create a TF-IDF vectorizer with scikit-learn's default settings
word_tfidf = TfidfVectorizer()

In [37]:
# Learn the vocabulary from the cleaned text and build the TF-IDF weighted matrix in one step
word_vectors_tfidf = word_tfidf.fit_transform(df['text_clean'].values)

In [38]:
# Display the sparse matrix summary (shape, dtype, number of stored entries)
# Same number of features as the default count matrix built from text_clean; the values are now floats
word_vectors_tfidf

<5157x12794 sparse matrix of type '<class 'numpy.float64'>'
	with 85915 stored elements in Compressed Sparse Row format>

In [39]:
# Document representation: one row per tweet, one column per vocabulary term,
# holding TF-IDF weights instead of raw counts.
# The frame is sparse-backed, so the full matrix is never densified in memory.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
vocab = word_tfidf.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(word_vectors_tfidf, columns=vocab)

Unnamed: 0,aa,aaadhar,aaanupriyaaa,aadhaar,aadhar,aadhe,aadityagautom,aadmi,aagr,aaj,aajtak,aakashhindocha,aakroshdin,aam,aamaaadmiparty,aamaadami,aamaadmi,aamaadmiparty,aamadmy,aamir,aamirkhan,aap,aapakhi,aapgujarat,aapharsh,aapians,aapkadepositgaya,aapkarajiv,aapko,aaplogical,aapnewsalert,aappoojapandey,aaptard,aapved,aapvind,aaq,aaqzvhgi,aartic,aartitikoo,aashabisht,...,ztznefjaz,zu,zucosqdqib,zue,zut,zuxxkunx,zv,zvd,zve,zvh,zvqiqo,zvr,zvsucykeky,zvt,zvup,zvvbjg,zw,zwcmfzca,zwdk,zwj,zwpql,zwsoa,zxhhmuwceq,zxiusza,zxuecwobqp,zy,zyf,zyitjkbklc,zylashz,zylu,zymrlzofxm,zynql,zyuakjdi,zz,zzdxhds,zzh,zzl,zzthdwqbfy,zzygw,zzyjzzuhlu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244017,0.0,0.0,0.0,0.0,0.0,0.244017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modifying argument values

In [40]:
# Update arguments: weight word pairs instead of single words
word_tfidf = TfidfVectorizer(ngram_range=(2,2))  # Bigrams only (minimum n = maximum n = 2)

In [41]:
# Learn the bigram vocabulary from the cleaned text and build the TF-IDF weighted matrix
word_vectors_tfidf = word_tfidf.fit_transform(df['text_clean'].values)

In [42]:
# Display the sparse matrix summary (shape, dtype, number of stored entries)
word_vectors_tfidf

<5157x41883 sparse matrix of type '<class 'numpy.float64'>'
	with 85476 stored elements in Compressed Sparse Row format>

In [43]:
# Document representation: one row per tweet, one column per bigram, holding TF-IDF weights.
# The frame is sparse-backed: densifying 5157 x 41883 floats with .toarray()
# would need roughly 1.7 GB.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
vocab = word_tfidf.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(word_vectors_tfidf, columns=vocab)

Unnamed: 0,aa gaye,aa https,aa lfy,aa lvsy,aa mazc,aa pje,aa rahe,aa to,aa yogi,aaadhar expansion,aaanupriyaaa dear,aadhaar amp,aadhaar and,aadhaar be,aadhaar biometric,aadhaar demonetization,aadhaar evm,aadhaar first,aadhaar https,aadhaar line,aadhaar live,aadhaar platform,aadhaar slave,aadhaar this,aadhaar will,aadhar and,aadhar could,aadhar demonetization,aadhar necessary,aadhar now,aadhe kehte,aadityagautom assam,aadmi aap,aadmi be,aadmi demonetization,aadmi jandhan,aadmi listen,aadmi punish,aadmi support,aagr mb,...,zone sagarikaghose,zone shashitharoor,zone shekhargupta,zone shekharkapur,zone srbachchan,zone understand,zone virsanghvi,zpjwgfkk podcast,zqblc pron,zql rbfpvw,zr erdntc,zrch emheh,zrmemz htt,zrxkqg https,zrxl gyw,zszn https,zt fvxw,zt fwf,zt nznruf,ztxsq jl,zu xrq,zut mojv,zv ixylhci,zvd via,zve myt,zvh youtube,zvr kj,zvsucykeky https,zvup iah,zw stpgpr,zwpql frwn,zwsoa google,zyitjkbklc via,zylu al,zymrlzofxm https,zz al,zz mflmpfd,zzh moxrtq,zzl offa,zzygw em
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230569,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
