In [127]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [128]:
!pip install torchvision
import torch
import torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [129]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import re
import nltk
nltk.download('stopwords')
import time
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from sklearn.utils import shuffle


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [131]:
train_df = pd.read_csv('/content/train.csv', encoding="ISO-8859-1", header=None)
train_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

test_df = pd.read_csv('/content/test.csv', encoding="ISO-8859-1", header=None)
test_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

In [132]:
word_bank = []

# Function to remove predefined stopwords to reduce disk usage
def preprocess(text):
    review = re.sub('[^a-zA-Z]',' ',text) 
    review = review.lower()
    review = review.split()
    ps = LancasterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    return ' '.join(review)

In [133]:
train_df = shuffle(train_df,random_state=2)
train_df = train_df[1:50000]

In [134]:
train_df['polarity'].value_counts()

4    25093
0    24906
Name: polarity, dtype: int64

In [135]:
train_df['polarity'] = train_df['polarity'].replace(4,1)
train_df

Unnamed: 0,polarity,id,date,query,user,tweet
408251,0,2059364084,Sat Jun 06 16:47:03 PDT 2009,NO_QUERY,MrsAmarieB,my baby's growing up
1559739,1,2186151891,Mon Jun 15 18:25:49 PDT 2009,NO_QUERY,epallaviccini,Painted Black-Rolling Stones..the best!
571248,0,2208723981,Wed Jun 17 09:33:02 PDT 2009,NO_QUERY,Kiwitabby,"kk, i'm logging off now BYEZZ!"
524639,0,2193564503,Tue Jun 16 08:37:46 PDT 2009,NO_QUERY,annaqui,Shitty shitty shitty news today
311150,0,2001240587,Tue Jun 02 00:15:30 PDT 2009,NO_QUERY,smellyocheese,@Askmewhats * hugs* what's wrong?
...,...,...,...,...,...,...
1402388,1,2054703436,Sat Jun 06 07:55:06 PDT 2009,NO_QUERY,misslilamae,"@tenishae26 hey, your new icon picture is look..."
452328,0,2069991159,Sun Jun 07 16:54:21 PDT 2009,NO_QUERY,poetictitlewave,I am MOST DEFINITELY a cotton-headed ninny mug...
983247,1,1834278721,Mon May 18 02:19:09 PDT 2009,NO_QUERY,jonoabroad,. @JonoH be careful with your bot - it's easy ...
1480049,1,2066790581,Sun Jun 07 11:10:36 PDT 2009,NO_QUERY,celineaura,should sleep NOW. LTO laterrr. i love friendst...


In [136]:
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [137]:
test_df['polarity'] = test_df['polarity'].replace(2,1)
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,1,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [138]:
test_df['polarity'] = test_df['polarity'].replace(4,1)
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,1,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,1,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,1,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,1,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,1,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,1,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,1,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [139]:
X_train = train_df['tweet'].apply(lambda x: preprocess(x))

In [140]:
y_train = train_df['polarity']
le = LabelEncoder()
y = le.fit_transform(y_train)

In [141]:
X_test = test_df['tweet']
y_test = test_df['polarity']

In [148]:
tfidf = TfidfVectorizer(max_features = 100)
X_train_tf = tfidf.fit_transform(X_train).toarray() 
X_test = tfidf.transform(test_data).toarray()

In [149]:
X_train_tf.shape, X_test.shape, y_train.shape, y_test.shape

((49999, 100), (6, 100), (49999,), (498,))

XGB Classifier

In [150]:
xg = xgb.XGBClassifier()
start_time = time.time()
xg.fit(X_train_tf, y_train)  
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 11.813103199005127 secs


Random Forest Classifier

In [152]:
rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
start_time = time.time()
rf.fit(X_train_tf, y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 3.773947238922119 secs
