In [28]:
import re
import pickle
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
train_data = pickle.load(open('merged_training.pkl', 'rb'))
train_data = pd.DataFrame(train_data)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416809 entries, 27383 to 64703
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      416809 non-null  object
 1   emotions  416809 non-null  object
dtypes: object(2)
memory usage: 9.5+ MB


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416809 entries, 27383 to 64703
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      416809 non-null  object
 1   emotions  416809 non-null  object
dtypes: object(2)
memory usage: 9.5+ MB


In [4]:
train_data['emotions'].value_counts()

joy         141067
sadness     121187
anger        57317
fear         47712
love         34554
surprise     14972
Name: emotions, dtype: int64

In [5]:
def process_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ",tweet.lower()).split())

In [6]:
train_data['processed_tweets'] = train_data['text'].apply(process_tweet)

In [7]:
train_data['processed_tweets'] = train_data['text'].apply(process_tweet)

In [8]:
train_data.head(1000)

Unnamed: 0,text,emotions,processed_tweets
27383,i feel awful about it too because it s my job ...,sadness,i feel awful about it too because it s my job ...
110083,im alone i feel awful,sadness,im alone i feel awful
140764,ive probably mentioned this before but i reall...,joy,ive probably mentioned this before but i reall...
100071,i was feeling a little low few days back,sadness,i was feeling a little low few days back
2837,i beleive that i am much more sensitive to oth...,love,i beleive that i am much more sensitive to oth...
...,...,...,...
4109,i am doing now is not gods will so i will not ...,sadness,i am doing now is not gods will so i will not ...
9994,i am not feeling all that creative today,joy,i am not feeling all that creative today
119429,i was feeling increasingly confident that not ...,joy,i was feeling increasingly confident that not ...
44055,i feel like my mind is so blank,sadness,i feel like my mind is so blank


In [12]:
train_data.drop(['text'],inplace=True,axis=1)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(train_data["processed_tweets"],train_data["emotions"], test_size = 0.2, random_state = 42)

In [14]:
count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

In [15]:
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [16]:
print(x_train_counts.shape)
print(x_train_tfidf.shape)

(333447, 67406)
(333447, 67406)


In [17]:
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [18]:
print(x_test_counts.shape)
print(x_test_tfidf.shape) 

(83362, 67406)
(83362, 67406)


# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
model1=LogisticRegression()
model1.fit(x_train_tfidf,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [20]:
predLR=model1.predict(x_test_tfidf)

In [21]:
from sklearn.metrics import confusion_matrix,f1_score
c_m = confusion_matrix(y_test,predLR)
print(c_m)

[[10229   346   226    34   487    17]
 [  388  7969   183    25   456   355]
 [  158   135 26404  1041   391   118]
 [   33    18  1556  5174    64     8]
 [  664   416   554    91 22726    53]
 [   19   536   255     8    65  2160]]


In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predLR)

0.8956359012499701

In [29]:
# Save the model as a pickle in a file
joblib.dump(model1, 'lr_model.pkl')

['lr_model.pkl']

# Decision Tree 

In [30]:
from sklearn.tree import DecisionTreeClassifier
model2=DecisionTreeClassifier()
model2.fit(x_train_tfidf,y_train)

DecisionTreeClassifier()

In [31]:
predDT=model2.predict(x_test_tfidf)

In [32]:
c_m = confusion_matrix(y_test,predDT)
print(c_m)

[[ 9868   499   169    48   734    21]
 [  564  7469   140    22   595   586]
 [  263   265 24612  1950   879   278]
 [   42    33  2122  4520   119    17]
 [  987   783   792   185 21673    84]
 [   13   788   293    13    68  1868]]


In [33]:
accuracy_score(y_test,predDT)

0.8398310981022529

In [34]:
joblib.dump(model2, 'dt_model.pkl')

['dt_model.pkl']

# Testing on new data

In [62]:
test_data = pd.read_csv('test_tweets.csv')

In [63]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   text    17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


In [64]:
test_data_copy = test_data.copy()

In [65]:
test_data['processed_tweet'] = test_data['text'].apply(process_tweet)

In [66]:
test_data.head()

Unnamed: 0,id,text,processed_tweet
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd bihday to my amazing hilarious nephew eli ...


In [68]:
test_data.drop(['text'],inplace=True,axis=1)

In [69]:
test_counts = count_vect.transform(test_data['processed_tweet'])
test_tfidf = transformer.transform(test_counts)

In [70]:
lr_model = joblib.load('lr_model.pkl')

In [71]:
predictions = lr_model.predict(test_tfidf)

In [72]:
final_result = pd.DataFrame({'tweet':test_data_copy['text'],'label':predictions})

In [74]:
final_result.to_csv('output.csv',index=False)