In [1]:
import numpy as np
import pandas as pd
import pickle
import re
import os
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Directory that every pickle/model file name in this notebook is joined onto.
DATA_DIRECTORY = 'datasets'

In [1]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

In [2]:
get_available_gpus()

[]

# Features


## Vocab Size

In [3]:
# File-name suffix associated with each vocabulary size variant.
vocab_sizes = dict([("normal", ""), ("small", "_sm"), ("large", "_lg")])

# Variant used for this run of the notebook.
VOCAB_SIZE = "large"

### Load vocabulary

In [4]:
# Vocabulary pickle file name; the suffix is looked up from the selected VOCAB_SIZE.
VOCAB_DUMP = f"vocab{vocab_sizes[VOCAB_SIZE]}.p"

In [5]:
# Use a context manager so the file handle is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load; only open files you produced yourself.
with open(os.path.join(DATA_DIRECTORY, VOCAB_DUMP), "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

### Load Corpus

In [6]:
# File name of the pickled preprocessed corpus inside DATA_DIRECTORY.
PREPROCESSED_CORPUS_DUMP = "preprocessed_corpus.p"

In [7]:
# Use a context manager so the file handle is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load; only open files you produced yourself.
with open(os.path.join(DATA_DIRECTORY, PREPROCESSED_CORPUS_DUMP), "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

In [8]:
corpus[0]

'woman complain clean hous man trash'

### Load Combined Dataframe

In [9]:
# File name of the pickled combined dataframe inside DATA_DIRECTORY.
DUMP_FILE = "combined_data.p"
# Use a context manager so the file handle is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load; only open files you produced yourself.
with open(os.path.join(DATA_DIRECTORY, DUMP_FILE), "rb") as data_file:
    data = pickle.load(data_file)

In [10]:
# Replace the index with a fresh 0..n-1 range (dropping the old one) so the
# index-based merges with the feature matrices further down line up row by row.
data = data.reset_index(drop=True)

In [11]:
data.loc[201]

class                                                    1
tweet    "@Nicholas_ted33: Kobe stay talking trash. But...
id                                                     203
Name: 201, dtype: object

### Load up tokenized tweets

In [12]:
# File name of the pickled token lists inside DATA_DIRECTORY.
TOKENIZED_DUMP = "tokenized_tweets.p"
# Use a context manager so the file handle is closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load; only open files you produced yourself.
with open(os.path.join(DATA_DIRECTORY, TOKENIZED_DUMP), "rb") as tokenized_file:
    tokenized = pickle.load(tokenized_file)

## Bag of Words


Create the bag-of-words matrix using the limited vocabulary chosen in the previous notebook.

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Count term occurrences per document over the fixed, pre-selected vocabulary.
cv = CountVectorizer(vocabulary=list(vocab), max_df=1., min_df=0.)
# Fit, transform and densify in one step; the dense array is what gets displayed.
cv_matrix = cv.fit_transform(corpus).toarray()
cv_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
len(cv_matrix[0])

10373

In [15]:
# Pull in all the words that are feature names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
words = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

In [16]:
# Wrap the count matrix in a dataframe with one column per feature name.
bow_df = pd.DataFrame(data=cv_matrix, columns=words)

In [17]:
bow_df.head()

Unnamed: 0,woman,complain,clean,hous,man,trash,boi,dat,cold,tyga,...,kennedi,vyapam,bred,obc,bechari,marina,gana,dhani,dadaji,<UNK>
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Remove the unknown-token column from the vocab features.
# errors="ignore" keeps this cell re-runnable once the column is already gone
# (a plain drop raises KeyError on the second execution).
bow_df = bow_df.drop(columns="<UNK>", errors="ignore")

In [19]:
# Join labels/tweets with the bag-of-words columns on the shared row index.
# The '_x' suffix is applied only to columns of `data` whose names also occur
# as columns of `bow_df`; the bag-of-words columns keep their names.
bow_combined = pd.merge(
    data,
    bow_df,
    left_index=True,
    right_index=True,
    suffixes=('_x', ''),
)

In [20]:
bow_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 10375 entries, class_x to dadaji
dtypes: int64(10374), object(1)
memory usage: 3.4+ GB


In [21]:
bow_combined.loc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
bechari                                                     0
marina                                                      0
gana                                                        0
dhani                                                       0
dadaji                                                      0
Name: 201, Length: 10375, dtype: object

#### Note
We have to add a suffix to the original data's columns because their names (`class`, `tweet`, `id`) also appear as words in the vocabulary.

In [22]:
bow_combined.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,masla,kennedi,vyapam,bred,obc,bechari,marina,gana,dhani,dadaji
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Save bow to disk

In [35]:
#X_train, X_test, y_train, y_test = train_test_split(bow_combined.iloc[:, 3:], bow_combined['class_x'], test_size=0.33,
#                                                   random_state=42, stratify=bow_combined['class_x'])

In [23]:
# Output file name; the suffix is looked up from the selected VOCAB_SIZE.
BOW_DUMP = f"bow{vocab_sizes[VOCAB_SIZE]}.p"
# Write through a context manager so the file is flushed and closed deterministically.
with open(os.path.join(DATA_DIRECTORY, BOW_DUMP), "wb") as bow_file:
    pickle.dump(bow_combined, bow_file)

## TFIDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights per document over the same fixed vocabulary.
tv = TfidfVectorizer(vocabulary=list(vocab), max_df=1., min_df=0.)
# Fit, transform and densify in one step.
tv_matrix = tv.fit_transform(corpus).toarray()

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# Note: this rebinds `vocab` to the vectorizer's feature names.
vocab = (
    tv.get_feature_names_out()
    if hasattr(tv, "get_feature_names_out")
    else tv.get_feature_names()
)
# One column per feature name holding the TF-IDF weights.
tfidf_df = pd.DataFrame(tv_matrix, columns=vocab)
tfidf_df.head()

Unnamed: 0,woman,complain,clean,hous,man,trash,boi,dat,cold,tyga,...,kennedi,vyapam,bred,obc,bechari,marina,gana,dhani,dadaji,<UNK>
0,0.426186,0.486502,0.480812,0.411279,0.329805,0.269405,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.250212,0.524596,0.318435,0.384685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Remove the unknown-token column from the vocab features.
# errors="ignore" keeps this cell re-runnable once the column is already gone
# (a plain drop raises KeyError on the second execution).
tfidf_df = tfidf_df.drop(columns="<UNK>", errors="ignore")

In [16]:
# Join labels/tweets with the TF-IDF columns on the shared row index; the '_x'
# suffix is applied only to columns of `data` that also occur in `tfidf_df`.
tfidf_full = pd.merge(
    data,
    tfidf_df,
    left_index=True,
    right_index=True,
    suffixes=('_x', ''),
)

In [17]:
tfidf_full.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,masla,kennedi,vyapam,bred,obc,bechari,marina,gana,dhani,dadaji
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0.426186,0.486502,0.480812,0.411279,0.329805,0.269405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.250212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save tfidf to disk

In [18]:
tfidf_full.iloc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
bechari                                                     0
marina                                                      0
gana                                                        0
dhani                                                       0
dadaji                                                      0
Name: 201, Length: 10375, dtype: object

In [19]:
# Output file name; the suffix is looked up from the selected VOCAB_SIZE.
TFIDF_DUMP = f"tfidf{vocab_sizes[VOCAB_SIZE]}.p"
# Write through a context manager so the file is flushed and closed deterministically.
with open(os.path.join(DATA_DIRECTORY, TFIDF_DUMP), "wb") as tfidf_file:
    pickle.dump(tfidf_full, tfidf_file)

# WordVectors

### Load up the model

In [25]:
from gensim.models import KeyedVectors
# File name of the saved word-vector model inside DATA_DIRECTORY.
MODEL_DUMP = "w2v.model"

In [26]:
# Load the saved keyed vectors from DATA_DIRECTORY.
# NOTE(review): mmap="r" presumably memory-maps the stored arrays read-only
# instead of reading them fully into RAM — confirm against the gensim docs.
kv = KeyedVectors.load(os.path.join(DATA_DIRECTORY, MODEL_DUMP), mmap="r")

In [22]:
tokenized[0]

['woman', 'complain', 'clean', 'hous', 'man', 'trash']

In [51]:
def average_tweet(model, tokenized_tweet):
    """
    Average the word vectors of every token of a tweet that `model` knows.

    Parameters
    ----------
    model : keyed-vector lookup (e.g. gensim ``KeyedVectors``)
        Must support ``word in model`` and ``model[list_of_words]``.
    tokenized_tweet : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    numpy.ndarray or list
        Element-wise mean of the known tokens' vectors, or an empty list when
        none of the tokens has a vector.
    """
    # Test membership on the `model` argument itself rather than on the global
    # `kv.wv`: the function previously ignored its own parameter here, and the
    # `.wv` alias on KeyedVectors is deprecated.
    known_words = [word for word in tokenized_tweet if word in model]
    if known_words:
        return np.mean(model[known_words], axis=0)
    return []

In [41]:
len(tokenized)

44623

In [46]:
# Mean embedding for each tokenized tweet, in the same order as `tokenized`.
tweet_vectors = list(map(lambda tokens: average_tweet(kv, tokens), tokenized))

  after removing the cwd from sys.path.


In [48]:
# One row per tweet, one column per vector component. Tweets for which
# average_tweet returned an empty list show up as all-NaN rows (see the null
# counts a few cells below).
vector_df = pd.DataFrame.from_records(tweet_vectors)

In [49]:
# Join labels/tweets with the averaged word vectors on the shared row index.
# No suffixes are supplied for the column names of either frame.
word2vec_full = pd.merge(
    data,
    vector_df,
    left_index=True,
    right_index=True,
    suffixes=(None, None),
)

In [50]:
word2vec_full.head()

Unnamed: 0,class,tweet,id,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,-0.317924,-0.174421,-0.045998,0.131469,0.153139,0.080083,0.584329,...,0.157028,-0.575342,-0.052679,-0.507448,-0.127183,-0.679388,-0.129394,-0.176748,-0.572039,-0.453219
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,-0.078156,-0.208762,-0.378451,-0.032828,-0.173598,-0.16706,0.252888,...,-0.108476,-0.406503,-0.200132,0.043845,0.161172,-0.056986,0.071785,-0.313311,0.081915,-0.199177
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.220351,-0.150742,-0.16655,0.115838,-0.122622,-0.177487,0.275816,...,-0.27164,0.081974,-0.11668,0.230674,0.222533,0.020707,-0.311411,-0.388627,-0.204376,0.092579
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.283788,-0.28506,-0.303382,0.952025,-0.227425,-0.579436,0.594272,...,-0.106376,0.018728,0.241592,0.328473,-0.488023,-0.616264,0.069447,-0.652167,-0.401744,0.058771
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,-0.427031,0.184674,-0.950997,0.251101,0.029121,-0.074548,0.771606,...,0.225774,0.212599,-0.077561,0.066444,0.108355,-0.243325,-0.123698,-0.511223,-0.314331,0.015501


In [53]:
word2vec_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 203 entries, class to 199
dtypes: float64(200), int64(2), object(1)
memory usage: 69.1+ MB


In [54]:
word2vec_full.isnull().sum()

class     0
tweet     0
id        0
0        82
1        82
         ..
195      82
196      82
197      82
198      82
199      82
Length: 203, dtype: int64

In [56]:
word2vec_full[word2vec_full[0].isnull()]

Unnamed: 0,class,tweet,id,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
950,2,&#128075; hi-ho http://t.co/FiC4FnRutZ,971,,,,,,,,...,,,,,,,,,,
2206,2,1-800-slap-a-hoe,2251,,,,,,,,...,,,,,,,,,,
2965,2,@DannyMndz93 @Titan21Mtzzz he's still a pogue ...,3037,,,,,,,,...,,,,,,,,,,
3127,2,@EricBaetsle your a greaser,3206,,,,,,,,...,,,,,,,,,,
3215,2,@FunnyPicsDepot he's a greaser,3299,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43688,2,&#128075; hi-ho http://t.co/FiC4FnRutZ,141766,,,,,,,,...,,,,,,,,,,
43716,2,Clic Ê__ https://t.co/JLF0Oi54gp Ê‰ÛÏ‰ÛÒÊ‰Û...,141834,,,,,,,,...,,,,,,,,,,
43919,2,@justinbieber you are in my journal,142308,,,,,,,,...,,,,,,,,,,
43988,2,@Swirley1 @AzTec_Ashy @TeamRetroEU he's back,142462,,,,,,,,...,,,,,,,,,,


In [57]:
word2vec_full.loc[43919]

class                                      2
tweet    @justinbieber you are in my journal
id                                    142308
0                                        NaN
1                                        NaN
                        ...                 
195                                      NaN
196                                      NaN
197                                      NaN
198                                      NaN
199                                      NaN
Name: 43919, Length: 203, dtype: object

In [60]:
average_tweet(kv, tokenized[43919])

[]

Sadly, these tweets do not get a vector back from the trained model (none of their tokens has a word vector).
We are going to drop them, as they are a small fraction of the dataset.

In [61]:
# Discard every row that contains at least one missing value
# (dropna defaults: axis=0, how='any').
word2vec_full = word2vec_full.dropna()

In [62]:
word2vec_full[word2vec_full[0].isnull()][:5]

Unnamed: 0,class,tweet,id,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199


### Dump the word2vec dataframe

In [63]:
# Output file name for the word2vec feature dataframe inside DATA_DIRECTORY.
WORD2VECDF_DUMP = "word2vecdf.p"
# Write through a context manager so the file is flushed and closed deterministically.
with open(os.path.join(DATA_DIRECTORY, WORD2VECDF_DUMP), "wb") as word2vec_file:
    pickle.dump(word2vec_full, word2vec_file)