In [3]:
import numpy as np
import pandas as pd
import pickle
import re
import os
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt

%matplotlib inline

In [7]:
# Directory (relative to the notebook) that holds every pickled input and output.
DATA_DIRECTORY = "datasets/"

# Features


### Load vocabulary

In [8]:
# File name of the pickled vocabulary inside DATA_DIRECTORY.
VOCAB_DUMP = 'vocab.p'

In [9]:
# Load the pickled vocabulary. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(DATA_DIRECTORY, VOCAB_DUMP), "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

### Load Corpus

In [10]:
# File name of the pickled, already-preprocessed corpus inside DATA_DIRECTORY.
PREPROCESSED_CORPUS_DUMP = 'preprocessed_corpus.p'

In [11]:
# Load the pickled corpus. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(DATA_DIRECTORY, PREPROCESSED_CORPUS_DUMP), "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

In [12]:
# Peek at the first corpus entry to confirm the load worked.
corpus[0]

'woman complain clean hous man trash'

### Load Combined Dataframe

In [13]:
# File name of the pickled combined dataframe inside DATA_DIRECTORY.
DUMP_FILE = "combined_data.p"
# Load it inside a context manager so the file handle is closed right away.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(DATA_DIRECTORY, DUMP_FILE), "rb") as data_file:
    data = pickle.load(data_file)

In [14]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded) so
# the later index-based merges pair row i of `data` with row i of each
# feature frame.
data = data.reset_index(drop=True)

In [15]:
# Spot-check a single row by index label.
data.loc[201]

class                                                    1
tweet    "@Nicholas_ted33: Kobe stay talking trash. But...
id                                                     203
Name: 201, dtype: object

## Bag of Words


Create the bag-of-words matrix using the limited vocabulary chosen in the previous notebook.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Count term occurrences per document over the fixed, pre-selected vocabulary.
# Per the scikit-learn docs, min_df / max_df are ignored when `vocabulary`
# is supplied, so they have no effect on the resulting columns here.
cv = CountVectorizer(vocabulary=list(vocab), min_df=0., max_df=1.)
# Densify straight away: the following cells index rows of the matrix and
# wrap it in a DataFrame.
cv_matrix = cv.fit_transform(corpus).toarray()
cv_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
# Number of columns in the count matrix (length of the first row).
len(cv_matrix[0])

6896

In [14]:
# Pull in all the words that are feature names (one per matrix column).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
# The result is converted to a list so `words` keeps its original type.
if hasattr(cv, "get_feature_names_out"):
    words = list(cv.get_feature_names_out())
else:
    words = cv.get_feature_names()

In [15]:
# Wrap the dense count matrix in a DataFrame, labelling each column with its term.
bow_df = pd.DataFrame(data=cv_matrix, columns=words)

In [17]:
# Preview the first rows of the bag-of-words frame.
bow_df.head()

Unnamed: 0,woman,complain,clean,hous,man,trash,boi,dat,cold,tyga,...,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito,<UNK>
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Remove the "<UNK>" (unknown-token) column that came in with the vocabulary.
# errors="ignore" keeps this cell re-runnable: running it again after the
# column is already gone is a no-op instead of raising KeyError.
bow_df = bow_df.drop(columns="<UNK>", errors="ignore")

In [21]:
# Attach the original columns (including the class label) to the bag-of-words
# features, pairing rows by index. Original column names that also occur as
# vocabulary terms receive the '_x' suffix; the term columns keep their names.
bow_combined = data.merge(
    bow_df,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_x", ""),
)

In [22]:
# Summarise row/column counts, dtypes and memory footprint of the merged frame.
bow_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 6898 entries, class_x to pandito
dtypes: int64(6897), object(1)
memory usage: 2.3+ GB


In [23]:
# Spot-check one merged row by index label.
bow_combined.loc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
_ia__ii                                                     0
vow                                                         0
haalat                                                      0
garib                                                       0
pandito                                                     0
Name: 201, Length: 6898, dtype: object

#### Note
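We add the `_x` suffix to the original data columns (`class`, `tweet`, `id`) because those names also appear as terms in the vocabulary; without a suffix they would collide with the bag-of-words columns of the same name.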

In [24]:
# Preview the merged frame: suffixed original columns first, then the term counts.
bow_combined.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Save bow to disk

In [25]:
# NOTE(review): disabled stratified train/test split, kept for reference only;
# nothing in this cell executes. Remove it or re-enable it deliberately.
#X_train, X_test, y_train, y_test = train_test_split(bow_combined.iloc[:, 3:], bow_combined['class_x'], test_size=0.33,
#                                                   random_state=42, stratify=bow_combined['class_x'])

In [26]:
# File name for the pickled bag-of-words frame inside DATA_DIRECTORY.
BOW_DUMP = "bow.p"
# Write inside a context manager so the file is flushed and closed as soon as
# the dump finishes, rather than whenever the handle is garbage collected.
with open(os.path.join(DATA_DIRECTORY, BOW_DUMP), "wb") as bow_file:
    pickle.dump(bow_combined, bow_file)

## TFIDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights per document over the same fixed vocabulary.
# Per the scikit-learn docs, min_df / max_df are ignored when `vocabulary`
# is supplied, so they have no effect on the resulting columns here.
tv = TfidfVectorizer(vocabulary=list(vocab), min_df=0., max_df=1.)
# Densify straight away; the next cell rounds the matrix and wraps it in a DataFrame.
tv_matrix = tv.fit_transform(corpus).toarray()

In [17]:
# Column labels for the TF-IDF frame. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back only on older installs. The result is converted to a list so the
# name keeps the type the old call returned.
# NOTE(review): this rebinds `vocab`, the name the pickled vocabulary was
# loaded into earlier in the notebook.
if hasattr(tv, "get_feature_names_out"):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()
# NOTE: the 2-decimal rounding is not display-only -- this rounded frame is
# what gets merged and pickled further down.
tfidf_df = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
tfidf_df.head()

Unnamed: 0,woman,complain,clean,hous,man,trash,boi,dat,cold,tyga,...,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito,<UNK>
0,0.43,0.49,0.48,0.41,0.33,0.27,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.52,0.32,0.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Remove the "<UNK>" (unknown-token) column that came in with the vocabulary.
# errors="ignore" keeps this cell re-runnable: running it again after the
# column is already gone is a no-op instead of raising KeyError.
tfidf_df = tfidf_df.drop(columns="<UNK>", errors="ignore")

In [19]:
# Attach the original columns (including the class label) to the TF-IDF
# features, pairing rows by index. Original column names that also occur as
# vocabulary terms receive the '_x' suffix; the term columns keep their names.
tfidf_full = data.merge(
    tfidf_df,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_x", ""),
)

In [20]:
# Preview the merged frame: suffixed original columns first, then the TF-IDF weights.
tfidf_full.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0.43,0.49,0.48,0.41,0.33,0.27,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save tfidf to disk

In [21]:
# Spot-check one merged row (selected by position) before persisting the frame.
tfidf_full.iloc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
_ia__ii                                                     0
vow                                                         0
haalat                                                      0
garib                                                       0
pandito                                                     0
Name: 201, Length: 6898, dtype: object

In [22]:
# File name for the pickled TF-IDF frame inside DATA_DIRECTORY.
TFIDF_DUMP = "tfidf.p"
# Write inside a context manager so the file is flushed and closed as soon as
# the dump finishes, rather than whenever the handle is garbage collected.
with open(os.path.join(DATA_DIRECTORY, TFIDF_DUMP), "wb") as tfidf_file:
    pickle.dump(tfidf_full, tfidf_file)