In [98]:
import pandas as pd
import re
import numpy as np
# Python program to generate word vectors using Word2Vec 
# importing all necessary modules 
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize 
from xgboost import XGBClassifier
import warnings 
import nltk  
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
warnings.filterwarnings(action = 'ignore') 
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import gensim 
from gensim.models import Word2Vec 
from imblearn.over_sampling import SMOTE
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download()

In [2]:
from keras.layers import LSTM, Convolution1D, Flatten, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.models import Sequential

Using TensorFlow backend.


In [3]:
df = pd.read_csv('./train_twitter.csv')
df_test = pd.read_csv('./test_tweets.csv')

In [4]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
df_test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


## Pre-processing

In [6]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt  

In [7]:
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*") # remove all @user
df_test['tidy_tweet'] = np.vectorize(remove_pattern)(df_test['tweet'], "@[\w]*")

In [8]:
# remove special characters, numbers, punctuations
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z]", " ")
df_test['tidy_tweet'] = df_test['tidy_tweet'].str.replace("[^a-zA-Z]", " ")

In [9]:
# keep tweets having more than 3 words in train set
df['tidy_tweet'] = df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) 

In [10]:
df.head(10)

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cause they offer wheelchair...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model love take with time
4,5,0,factsguide: society now #motivation,factsguide society motivation
5,6,0,[2/2] huge fan fare and big talking before the...,huge fare talking before they leave chaos disp...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,next school year year exams think about that s...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,love land allin cavs champions cleveland cleve...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here


In [11]:
df_test.head(10)

Unnamed: 0,id,tweet,tidy_tweet
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedic...
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see th...
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystohe...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",rd bihday to my amazing hilarious nephew...
5,31968,choose to be :) #momtips,choose to be momtips
6,31969,something inside me dies ð¦ð¿â¨ eyes nes...,something inside me dies eyes nes...
7,31970,#finished#tattoo#inked#ink#loveitâ¤ï¸ #â¤ï¸...,finished tattoo inked ink loveit ...
8,31971,@user @user @user i will never understand why...,i will never understand why my dad left me...
9,31972,#delicious #food #lovelife #capetown mannaep...,delicious food lovelife capetown mannaep...


In [12]:
 # tokenizing
tokenized_tweet = df['tidy_tweet'].apply(lambda x:  word_tokenize(x))
tokenized_tweet_test = df_test['tidy_tweet'].apply(lambda x:  word_tokenize(x))

In [13]:
# removing stop words
stop = set(stopwords.words('english'))
tokenized_tweet = tokenized_tweet.apply(lambda x : [i for i in x if i not in stop])
tokenized_tweet_test = tokenized_tweet_test.apply(lambda x : [i for i in x if i not in stop])

In [14]:
tokenized_tweet.head()

0    [father, dysfunctional, selfish, drags, kids, ...
1    [thanks, lyft, credit, cause, offer, wheelchai...
2                                    [bihday, majesty]
3                            [model, love, take, time]
4                    [factsguide, society, motivation]
Name: tidy_tweet, dtype: object

In [15]:
# Lemmetizing
lmtzr = WordNetLemmatizer()
tokenized_tweet = tokenized_tweet.apply(lambda x: [lmtzr.lemmatize(i) for i in x]) # stemming
tokenized_tweet_test = tokenized_tweet_test.apply(lambda x: [lmtzr.lemmatize(i) for i in x])
tokenized_tweet.head()

0    [father, dysfunctional, selfish, drag, kid, dy...
1    [thanks, lyft, credit, cause, offer, wheelchai...
2                                    [bihday, majesty]
3                            [model, love, take, time]
4                    [factsguide, society, motivation]
Name: tidy_tweet, dtype: object

In [16]:
# Stemming
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet_test = tokenized_tweet_test.apply(lambda x: [stemmer.stem(i) for i in x])
tokenized_tweet.head()


0     [father, dysfunct, selfish, drag, kid, dysfunct]
1    [thank, lyft, credit, caus, offer, wheelchair,...
2                                    [bihday, majesti]
3                            [model, love, take, time]
4                          [factsguid, societi, motiv]
Name: tidy_tweet, dtype: object

In [17]:
tokenized_tweet_test

0        [studiolif, aislif, requir, passion, dedic, wi...
1        [white, supremacist, want, everyon, see, new, ...
2        [safe, way, heal, acn, altwaystoh, healthi, heal]
3        [hp, curs, child, book, reserv, alreadi, ye, h...
4        [rd, bihday, amaz, hilari, nephew, eli, ahmir,...
5                                          [choos, momtip]
6        [someth, insid, dy, eye, ness, smokeyey, tire,...
7         [finish, tattoo, ink, ink, loveit, thank, aleee]
8        [never, understand, dad, left, young, deep, in...
9        [delici, food, lovelif, capetown, mannaepicur,...
10       [dayswast, narcosi, infinit, ep, make, awar, g...
11       [one, world, greatest, spo, event, leman, team...
12                        [half, way, websit, allgoingwel]
13       [good, food, good, life, enjoy, call, garlic, ...
14       [stand, behind, guncontrolpleas, senselessshoo...
15       [ate, ate, ate, jamaisasthi, fish, curri, praw...
16             [got, limit, edit, rain, shine, set, toda

In [18]:
df['tidy_tweet'] = tokenized_tweet
df_test['tidy_tweet'] = tokenized_tweet_test

## Exctracting feature

In [19]:
feature_vector_size = 250

In [106]:
model1 = gensim.models.Word2Vec(pd.concat([df['tidy_tweet'],df_test['tidy_tweet']]), min_count = 1,  
                              size = feature_vector_size, window = 4,sg=1)  # Word2Vec trained on both train and test sets

In [107]:
def get_feature_vectors(row):
    l= np.zeros((feature_vector_size,1))
 
    for i in row:
        
        l = np.add ( l , model1.wv[i].reshape((feature_vector_size,1)) )
    avg_vec= np.divide(l,len(row))
      
    
    return avg_vec

In [108]:
train_features = df['tidy_tweet'].apply(get_feature_vectors)
test_features = df_test['tidy_tweet'].apply(get_feature_vectors)

In [109]:
train_features.shape

(31962,)

In [110]:
X = np.stack(train_features.values).reshape((-1, feature_vector_size))

In [111]:
test_X = np.stack(test_features.values).reshape((-1, feature_vector_size))

In [112]:
test_X.shape

(17197, 250)

In [113]:
y = df['label']


## Train model and predict

In [114]:
np.where(np.isnan(X))

(array([  506,   506,   506, ..., 31781, 31781, 31781]),
 array([  0,   1,   2, ..., 247, 248, 249]))

In [115]:
X = np.nan_to_num(X) # handling na values
test_X = np.nan_to_num(test_X)

In [116]:
X.shape

(31962, 250)

In [117]:
X_train = X[:25000,:] # sample train and validation sets
X_val = X[25000:,:]

In [118]:
y_train = y[:25000]
y_val = y[25000:]

In [119]:
ct_min= 23246
ct_maj= 6000

In [120]:
smt = SMOTE(ratio={0:23246,1:3400}, k_neighbors=1,random_state=0) # minority oversampling

In [121]:
smt.fit(X_train,y_train)

SMOTE(k=None, k_neighbors=1, kind='regular', m=None, m_neighbors=10, n_jobs=1,
   out_step=0.5, random_state=0, ratio={0: 23246, 1: 3400},
   svm_estimator=None)

In [122]:
X_resam,y_resam=  smt.fit_sample(X_train,y_train)

In [123]:
rf =RandomForestClassifier(random_state=0,n_estimators=25,max_features=50,class_weight={1:2242/31962,0:1})

In [124]:
#xg_cf = XGBClassifier()

In [125]:
rf.fit(X_resam,y_resam)

RandomForestClassifier(bootstrap=True,
            class_weight={1: 0.07014579813528565, 0: 1}, criterion='gini',
            max_depth=None, max_features=50, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [126]:
rf.score(X_val,y_val)

0.9495834530307383

In [127]:
confusion_matrix(y_val,rf.predict(X_val))

array([[6340,  134],
       [ 217,  271]])

In [128]:
f1_score(y_val,rf.predict(X_val))

0.606942889137738

In [101]:
X_resam.shape

(26646, 250)

In [129]:
sum(rf.predict(test_X))

923

In [130]:
pd.DataFrame(rf.predict(test_X)).to_csv('pred_twitter_smote_3k_stop_sg.csv') # write predictions to csv