# Import Basic Libraries

In [0]:

import pandas as pd
import numpy as np
import string 
import re
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings("ignore")
import nltk
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

# Loading the Test & Train DataSet

In [0]:
# Read the three competition CSVs: labelled train set, unlabelled test set, sample submission.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_E6oV3lV.csv',
        'test_tweets_anuFYb8.csv',
        'sample_submission_gfvA5FD.csv',
    )
)

In [47]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [48]:
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [49]:
submission.head()

Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0


In [50]:
# Shape of train & test dataset accordingly

train.shape, test.shape

((31962, 3), (17197, 2))

In [0]:
# Combine the train & test datasets into one frame so both receive identical preprocessing.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent
# (test rows get NaN in the `label` column, exactly as before).
df = pd.concat([train, test], ignore_index=True)

In [52]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


In [53]:
df.shape

(49159, 3)

# Data Preprocessing

In [0]:
# Strip @user handles, then replace every character that is not a letter or '#' with a space.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern as a literal
# string by default, which would silently leave the tweets unchanged. Raw strings avoid the
# invalid-escape warning for `\w`.
df['tweet1'] = df['tweet'].str.replace(r'@\w*', '', regex=True)
df['tweet1'] = df['tweet1'].str.replace(r'[^a-zA-Z#]', ' ', regex=True)


In [55]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
# Function for removing punctuation
def remove_punct(text):
    """Return `text` with every character found in `string.punctuation` dropped."""
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

In [0]:
# Drop the remaining punctuation (the '#' kept by the previous step) from every tweet
df['tweet2'] = df['tweet1'].apply(remove_punct)

In [58]:
df.head()

Unnamed: 0,id,label,tweet,tweet1,tweet2
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...,thanks for lyft credit i can t use cause the...
2,3,0.0,bihday your majesty,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,5,0.0,factsguide: society now #motivation,factsguide society now #motivation,factsguide society now motivation


# Tokenization

In [0]:
# Split each cleaned tweet on whitespace into a list of word tokens
df['token'] = df['tweet2'].apply(str.split)

In [60]:
df.token

0        [when, a, father, is, dysfunctional, and, is, ...
1        [thanks, for, lyft, credit, i, can, t, use, ca...
2                                  [bihday, your, majesty]
3        [model, i, love, u, take, with, u, all, the, t...
4                   [factsguide, society, now, motivation]
                               ...                        
49154    [thought, factory, left, right, polarisation, ...
49155    [feeling, like, a, mermaid, hairflip, neverrea...
49156    [hillary, campaigned, today, in, ohio, omg, am...
49157    [happy, at, work, conference, right, mindset, ...
49158    [my, song, so, glad, free, download, shoegaze,...
Name: token, Length: 49159, dtype: object

# Stopwords

In [61]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
stopword=nltk.corpus.stopwords.words('english')

In [0]:
def remove_stopwords(tokenized_list):
    """Return the tokens of `tokenized_list` that are not in the module-level `stopword` list."""
    kept = []
    for token in tokenized_list:
        if token in stopword:
            continue
        kept.append(token)
    return kept

In [0]:
# Filter the English stopwords out of every token list
df['nonstop'] = df['token'].apply(remove_stopwords)

In [65]:
df.nonstop, df.token

(0        [father, dysfunctional, selfish, drags, kids, ...
 1        [thanks, lyft, credit, use, cause, offer, whee...
 2                                        [bihday, majesty]
 3                      [model, love, u, take, u, time, ur]
 4                        [factsguide, society, motivation]
                                ...                        
 49154    [thought, factory, left, right, polarisation, ...
 49155    [feeling, like, mermaid, hairflip, neverready,...
 49156    [hillary, campaigned, today, ohio, omg, amp, u...
 49157    [happy, work, conference, right, mindset, lead...
 49158    [song, glad, free, download, shoegaze, newmusi...
 Name: nonstop, Length: 49159, dtype: object,
 0        [when, a, father, is, dysfunctional, and, is, ...
 1        [thanks, for, lyft, credit, i, can, t, use, ca...
 2                                  [bihday, your, majesty]
 3        [model, i, love, u, take, with, u, all, the, t...
 4                   [factsguide, society, now, motiva

In [0]:
from nltk.stem.wordnet import *

In [0]:
wn=WordNetLemmatizer()

In [68]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Lemmatization

In [0]:
# Lemmatize each remaining token with the WordNet lemmatizer `wn`
df['lemma'] = df['nonstop'].apply(lambda tokens: list(map(wn.lemmatize, tokens)))

In [70]:
df.lemma

0        [father, dysfunctional, selfish, drag, kid, dy...
1        [thanks, lyft, credit, use, cause, offer, whee...
2                                        [bihday, majesty]
3                      [model, love, u, take, u, time, ur]
4                        [factsguide, society, motivation]
                               ...                        
49154    [thought, factory, left, right, polarisation, ...
49155    [feeling, like, mermaid, hairflip, neverready,...
49156    [hillary, campaigned, today, ohio, omg, amp, u...
49157    [happy, work, conference, right, mindset, lead...
49158    [song, glad, free, download, shoegaze, newmusi...
Name: lemma, Length: 49159, dtype: object

In [0]:
tokenized_tweet=df['lemma']

In [30]:
df.lemma

0        [father, dysfunctional, selfish, drag, kid, dy...
1        [thanks, lyft, credit, use, cause, offer, whee...
2                                        [bihday, majesty]
3                      [model, love, u, take, u, time, ur]
4                        [factsguide, society, motivation]
                               ...                        
49154    [thought, factory, left, right, polarisation, ...
49155    [feeling, like, mermaid, hairflip, neverready,...
49156    [hillary, campaigned, today, ohio, omg, amp, u...
49157    [happy, work, conference, right, mindset, lead...
49158    [song, glad, free, download, shoegaze, newmusi...
Name: lemma, Length: 49159, dtype: object

In [0]:
#!pip install mosestokenizer

In [0]:
#from mosestokenizer import MosesTokenizer, MosesDetokenizer

In [0]:
#import MosesDetokenizer

#!pip install -q matplotlib-venn

In [0]:
import spacy

In [0]:
#from sacremoses import MosesDetokenizer

In [0]:
#df['final']=df['lemma'].apply(lambda x: detokenizer.detokenize(x))

In [0]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [79]:
df.lemma.head()

0    [father, dysfunctional, selfish, drag, kid, dy...
1    [thanks, lyft, credit, use, cause, offer, whee...
2                                    [bihday, majesty]
3                  [model, love, u, take, u, time, ur]
4                    [factsguide, society, motivation]
Name: lemma, dtype: object

In [0]:
# Join each lemma list back into a single string for the vectorizers.
# The detokenizer is built once and reused, instead of instantiating a new
# TreebankWordDetokenizer for every row of the frame.
detokenizer = TreebankWordDetokenizer()
df['final'] = df['lemma'].apply(detokenizer.detokenize)

In [81]:
df.final

0        father dysfunctional selfish drag kid dysfunct...
1        thanks lyft credit use cause offer wheelchair ...
2                                           bihday majesty
3                              model love u take u time ur
4                            factsguide society motivation
                               ...                        
49154    thought factory left right polarisation trump ...
49155    feeling like mermaid hairflip neverready forma...
49156    hillary campaigned today ohio omg amp used wor...
49157    happy work conference right mindset lead cultu...
49158    song glad free download shoegaze newmusic newsong
Name: final, Length: 49159, dtype: object

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
count_vect=CountVectorizer(ngram_range=(1,2),max_df=0.90, min_df=2, stop_words='english')

In [0]:
# Build the bag-of-words count matrix (unigrams + bigrams, per `count_vect`'s settings).
# NOTE(review): `df` holds the train AND test tweets, so the vocabulary and the min_df/max_df
# filtering are fitted on test rows too — confirm this is acceptable for the evaluation setup.
X_count=count_vect.fit_transform(df['final'])

In [85]:
X_count

<49159x50778 sparse matrix of type '<class 'numpy.int64'>'
	with 427990 stored elements in Compressed Sparse Row format>

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
tf=TfidfVectorizer(ngram_range=(1,2),max_df=0.90, min_df=2, stop_words='english')

In [0]:
# Build the TF-IDF matrix with the same n-gram / document-frequency settings as the count matrix.
# NOTE(review): `df` holds the train AND test tweets, so the vocabulary and IDF weights are
# fitted on test rows too — confirm this is acceptable for the evaluation setup.
X_tf=tf.fit_transform(df['final'])

In [89]:
X_tf

<49159x50778 sparse matrix of type '<class 'numpy.float64'>'
	with 427990 stored elements in Compressed Sparse Row format>

# Split the DataSet into Train & Test

In [0]:
# The combined matrix stacks the train rows first, then the test rows, so split at len(train)
# rather than at a hard-coded row count that silently breaks if the input files change.
n_train = len(train)
train_bow = X_count[:n_train, :]
test_bow = X_count[n_train:, :]

In [0]:
# Hold out 30% of the labelled bag-of-words rows for validation (fixed seed for repeatability)
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow,
    train['label'],
    test_size=0.3,
    random_state=42,
)

# Logistic Regression

In [0]:
Lreg = LogisticRegression()

In [94]:
Lreg.fit(xtrain_bow, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
prediction = Lreg.predict_proba(xvalid_bow)

In [118]:
prediction

array([0, 0, 0, ..., 0, 1, 0])

In [0]:
prediction_int = prediction[:,1] >= 0.3

In [119]:
prediction_int

array([0, 0, 0, ..., 0, 1, 0])

In [0]:
# Convert the boolean threshold mask to 0/1 labels.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `int` is the drop-in replacement.
prediction_int = prediction_int.astype(int)

In [121]:
prediction_int

array([0, 0, 0, ..., 0, 1, 0])

In [98]:
f1_score(yvalid, prediction_int)

0.689075630252101

# Random Forest

In [99]:
# Random Forest on the bag-of-words features, scored by F1 on the validation split.
# max_features='auto' was an alias of 'sqrt' for classifiers and was removed in scikit-learn 1.3,
# so 'sqrt' is spelled out to keep the same behaviour on current releases.
RF = RandomForestClassifier(n_estimators=400, max_features='sqrt', random_state=11).fit(xtrain_bow, ytrain)
prediction = RF.predict(xvalid_bow)
f1_score(yvalid, prediction)

0.63710407239819

In [0]:
# Split the TF-IDF matrix at len(train) instead of a hard-coded row count, then reuse the row
# labels of the bag-of-words split so both feature sets share the same train/validation rows.
# Indexing with ytrain.index / yvalid.index is positional; that matches here because `train`
# is read with the default 0..n-1 index.
n_labelled = len(train)
train_tfidf = X_tf[:n_labelled, :]
test_tfidf = X_tf[n_labelled:, :]
xtrain_tfidf = train_tfidf[ytrain.index]
xvalid_tfidf = train_tfidf[yvalid.index]

# TF-IDF Scores


In [103]:
#Logistic Regression
# Fit a separate estimator for the TF-IDF features. Refitting the shared `Lreg` here would
# overwrite the bag-of-words model, and the later `Lreg.predict_proba(test_bow)` call would
# then score bag-of-words rows with TF-IDF-trained coefficients.
Lreg_tfidf = LogisticRegression()
Lreg_tfidf.fit(xtrain_tfidf, ytrain)
prediction = Lreg_tfidf.predict_proba(xvalid_tfidf)
# Label a tweet positive when its class-1 probability is at least 0.3.
# (`np.int` was removed in NumPy 1.24; the builtin `int` is the replacement.)
prediction_int = (prediction[:, 1] >= 0.3).astype(int)
f1_score(yvalid, prediction_int)

0.5337361530715006

In [105]:
#Random Forest
# Trained under its own name so the bag-of-words forest held in `RF` is not replaced by a model
# fitted on TF-IDF features (`RF` is applied to bag-of-words rows later in the notebook).
# max_features='auto' meant 'sqrt' for classifiers and was removed in scikit-learn 1.3.
RF_tfidf = RandomForestClassifier(n_estimators=1500, max_features='sqrt', random_state=11).fit(xtrain_tfidf, ytrain)
prediction = RF_tfidf.predict(xvalid_tfidf)
f1_score(yvalid, prediction)

0.629277566539924

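With bag-of-words features, Logistic Regression reaches a higher validation F1 score (0.689) than Random Forest (0.637); with TF-IDF features the ranking reverses (0.534 vs. 0.629). The best result overall is Logistic Regression on bag-of-words features, so that model is applied to the test data set.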

In [0]:
test_pred = Lreg.predict_proba(test_bow) 


In [0]:
prediction_int1 = test_pred[:,1] >= 0.3


In [0]:
# Convert the boolean threshold mask to 0/1 labels.
# `np.int` no longer exists in NumPy >= 1.24; the builtin `int` yields the same integer labels.
test_pred = prediction_int1.astype(int)

In [0]:
test_pred=pd.DataFrame(test_pred)

In [0]:

# Keep the Logistic Regression predictions computed in the cells above for the submission.
# This cell used to overwrite them with RF.predict(test_bow), which contradicts the stated
# choice of Logistic Regression as the final model. Flatten the one-column frame to a 1-D
# array of labels so it can be assigned as a column.
test_pred = np.asarray(test_pred).ravel()

In [0]:
test['label'] = test_pred
submission = test[['id','label']]

In [130]:
submission.head()

Unnamed: 0,id,label
0,31963,0
1,31964,1
2,31965,0
3,31966,0
4,31967,0


In [131]:
import os
os.getcwd()

'/content'

In [0]:
submission.to_csv('submission2.csv', index=False)