In [4]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split

In [5]:
# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# documents = dataset.data

In [7]:
df = pd.read_csv(
    filepath_or_buffer='Trump.csv',
    header=None
)

In [8]:
# CATEGORISING THE SENTIMENT TABLE
# NEUTRAL -> 0 | NEGATIVE -> 1 | POSITIVE -> 2
df[1] = df[1].astype('category')
df[1] =  df[1].cat.codes

# NUMBER OF SAMPLES FOR EACH CATEGORY
df[1].value_counts()

0    2928
1    1369
2    1303
Name: 1, dtype: int64

In [9]:
#df = pd.DataFrame({'label':dataset.target, 'text':dataset.data})
df = pd.DataFrame({'label':df[1], 'text':df[0]})
df.shape

(5600, 2)

In [10]:
# df = df[df['label'].isin([1,10])]
# df = df.reset_index(drop = True)
# df['label'].value_counts()

In [11]:
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ")
df.head(5)

Unnamed: 0,label,text
0,1,Due to COVID Berks Arts Council presents th...
1,1,RT NCDCgov new cases of COVID have be...
2,1,RT NCDCgov new cases of COVID have be...
3,0,RT LizaYuzda Hey Fluevog to keep the heart...
4,2,Super super Jack


In [12]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aviralsrivastava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# tokenization 
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization 
detokenized_doc = [] 
for i in range(len(df)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 

df['text'] = detokenized_doc

In [14]:
# split data into training and validation set
df_trn, df_val = train_test_split(df, stratify = df['label'], test_size = 0.15, random_state = 12)
df_trn.shape, df_val.shape

((4760, 2), (840, 2))

In [15]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [16]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7)

In [17]:
# train the learner object with learning rate = 1e-2
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,5.764486,4.364111,0.320268,04:23


In [18]:
learn.save_encoder('ft_enc')

In [19]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)
learn.load_encoder('ft_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (4760 items)
x: TextList
xxbos xxmaj with datarobot xxunk machine learning solution researchers xxunk every step needed build deploy xxunk powerful xxup ai leading powerful predictions https co xxunk o xxup covid datarobot datarobot,xxbos xxup covid xxup covid capital capital xxmaj finance xxmaj what s xxmaj the xxmaj magic xxmaj number xxmaj in xxmaj xxunk xxmaj loans xxmaj to xxmaj save xxmaj xxunk xxmaj small xxmaj businesses via https co xxunk x,xxbos xxup rt xxmaj univ inenglish xxmaj scientists several health institutions found evidence proves xxup covid xxunk xxmaj mexico come diff,xxbos xxmaj the latest xxmaj leadership https co xxunk xxmaj thanks xxunk xxunk xxunk covid leadership,xxbos xxup rt xxmaj xxunk xxmaj does anyone else turn xxup bbc xxmaj covid press conference media questions begin xxmaj you could see faces xxmaj prof xxmaj xxunk
y: CategoryList
2,1,1,1,0
Path: .;

Valid: LabelList (840 items)
x: TextList
xxbos xx

In [20]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.894329,0.758574,0.684524,05:32


In [21]:
# get predictions
preds, targets = learn.get_preds()

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,415,91,117
1,13,98,17
2,11,16,62


In [28]:
for i in range(1, 11):
    print(df_val[i-1:i]['text'], preds[i-1:i])

187    RT scrowder The American public NEEDS know new...
Name: text, dtype: object tensor([[0.8127, 0.0556, 0.1317]])
4325    Some us STILL even recieved Stimulusdeposit ye...
Name: text, dtype: object tensor([[0.7336, 0.0438, 0.2225]])
524    RT MDMEMA ALERT We received several calls rega...
Name: text, dtype: object tensor([[0.1897, 0.6634, 0.1469]])
5153    RT susanj Which Corrupt realDonaldTrump fired ...
Name: text, dtype: object tensor([[0.7091, 0.1511, 0.1398]])
4220    RT lewis goodall NEW I leaked covid death figu...
Name: text, dtype: object tensor([[0.8472, 0.1115, 0.0414]])
258    RT iingwen Don miss Vice President Chen videoc...
Name: text, dtype: object tensor([[0.5135, 0.2750, 0.2115]])
2094    healthgovau And excersise opinions ridiculed S...
Name: text, dtype: object tensor([[0.5121, 0.1796, 0.3082]])
4058    Thank god T V boyfriends QuarantineLife COVID
Name: text, dtype: object tensor([[0.2880, 0.1208, 0.5911]])
3822    Would buy house go guy mask That question Lol .

tensor(0.1317)

In [42]:
from pymongo import MongoClient

client = MongoClient()
db = client.phase3
file_path = "Trump.csv"
collection_name = '{}'.format(file_path.replace('.', '_'))

In [90]:
for i in range(1, len(df_val)+1):
    tweet = df_val[i-1:i]['text'].array[0]
    neutral = preds[i-1:i][0][0]
    negative = preds[i-1:i][0][1]
    positive = preds[i-1:i][0][2]
    if neutral > positive and neutral > negative:
        sentiment = 'neutral'
    elif negative > neutral and negative > positive:
        sentiment = 'negative'
    else:
        sentiment = 'positive'
    db[collection_name].insert_one(
        {
            "tweet": tweet,
            "sentiment": sentiment
        }
    )
    if i % 100 == 0:
        print(i)

    

100
200
300
400
500
600
700
800


In [86]:
len(preds)

840

In [87]:
len(df_val)

840