1. target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)  
2. ids: the id of the tweet (2087)  
3. date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)  
4. flag: the query (lyx). If there is no query, then this value is NO_QUERY.  
5. user: the user that tweeted (robotickilldozr)  
6. text: the text of the tweet (Lyx is cool)

In [1]:
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

# nltk packages
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
import unidecode
import csv

RANDOM_SEED = 20

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dahiy_uokx4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dahiy_uokx4\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dahiy_uokx4\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dahiy_uokx4\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the training tweets. Column names are passed explicitly via `names`,
# so the first row of the file is read as data rather than as a header.
train = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin',
    names=["target", "id", "date", "flag", "user", "text"],
)

In [3]:
# Load the manually labelled test tweets with the same six column names that
# are used for the training file.
test = pd.read_csv(
    'testdata.manual.2009.06.14.csv',
    encoding='latin',
    names=["target", "id", "date", "flag", "user", "text"],
)

In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,target,id,date,flag,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,stellargirl loooooooovvvvvveee kindle2 dx cool...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,reading kindle2 love lee child good read
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,ok first assesment # kindle2 fucking rock
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,kenburbary youll love kindle2 ive mine month n...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,mikefish fair enough kindle2 think perfect


In [6]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [None]:
# cleaning part

# Raw strings keep the backslash escapes (`\[`, `\]`, `\|`) as regex syntax
# instead of relying on Python passing unknown string escapes through.
# Characters in this class are replaced by a space so neighbouring words stay apart.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is not a digit, lowercase letter, space, '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# A run of non-space characters that starts at '@', 'htt(p)://', 'http(s)://' or 'www'.
MENTION_OR_URL_RE = re.compile(r"(?:\@|http?\://|https?\://|www)\S+")
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one tweet into a space-separated string of kept words.

    Steps, in order:
      1. lowercase the text;
      2. delete every match of MENTION_OR_URL_RE (mentions and URLs);
      3. collapse runs of whitespace to single spaces;
      4. delete '#' and turn '_' into a space;
      5. replace REPLACE_BY_SPACE_RE characters with a space;
      6. delete every character matched by BAD_SYMBOLS_RE;
      7. drop words found in the NLTK English stopword list.

    No lemmatization is applied in this variant.

    Parameters
    ----------
    text : str or missing value
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text, or "" when `text` is not a string (NaN, None, ...).
    """
    # Anything that is not a string (NaN from a missing cell, None, numbers)
    # has no text to clean; the old `text != text` test only caught NaN and
    # crashed on the other cases at `.lower()`.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = MENTION_OR_URL_RE.sub("", text)

    text = " ".join(text.split())
    text = text.replace("#", "").replace("_", " ")  # remove '#' symbol
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)

    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [7]:
# cleaning part
# NOTE(review): another cell in this notebook also defines clean_text (a variant
# that strips mentions/URLs and does not lemmatize); whichever cell ran last is
# the one used by the .apply(clean_text) cells.

# Raw strings keep the backslash escapes (`\[`, `\]`, `\|`) as regex syntax
# instead of relying on Python passing unknown string escapes through.
# Characters in this class are replaced by a space so neighbouring words stay apart.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is not a digit, lowercase letter, space, '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
# Built once for the whole notebook instead of once per tweet.
LEMMATIZER = WordNetLemmatizer()

def clean_text(text):
    """Normalise one tweet into a space-separated string of lemmatized words.

    Steps, in order:
      1. lowercase the text;
      2. replace REPLACE_BY_SPACE_RE characters with a space;
      3. delete every character matched by BAD_SYMBOLS_RE;
      4. tokenize with nltk.word_tokenize and lemmatize each token;
      5. drop tokens found in the NLTK English stopword list.

    Parameters
    ----------
    text : str or missing value
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text, or "" when `text` is not a string (NaN, None, ...).
    """
    # Anything that is not a string (NaN from a missing cell, None, numbers)
    # has no text to clean; the old `text != text` test only caught NaN and
    # crashed on the other cases at `.lower()`.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    lemmas = (LEMMATIZER.lemmatize(word) for word in nltk.word_tokenize(text))
    # Stopwords are filtered after lemmatization, as in the original ordering.
    return ' '.join(lemma for lemma in lemmas if lemma not in STOPWORDS)

In [10]:
# Overwrite the training 'text' column with its cleaned form (one clean_text call per row).
train['text'] = train['text'].apply(clean_text)

In [8]:
# Overwrite the test 'text' column with its cleaned form (one clean_text call per row).
test['text'] = test['text'].apply(clean_text)

In [11]:
# Keep only 'target' and 'text'. Rebinding the result (instead of inplace=True)
# and passing errors='ignore' makes this cell safe to re-run: a second run no
# longer raises KeyError because the columns are already gone.
UNUSED_COLUMNS = ['id', 'user', 'date', 'flag']
train = train.drop(columns=UNUSED_COLUMNS, errors='ignore')
test = test.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [12]:
# Features are the cleaned tweet text, labels are the polarity codes.
X = train['text']
y = train['target']

# NOTE(review): the classification report further down shows label 2 in y_test
# never being predicted — presumably the training labels contain no 2; confirm
# with train['target'].value_counts() before comparing test accuracy to
# validation accuracy.
X_test = test['text']
y_test = test['target']

# Fraction of the training frame held out for validation.
valid_size = 0.3
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=valid_size, random_state=RANDOM_SEED
)

# Report the sizes of the splits that were actually produced, rather than
# re-deriving them with float arithmetic that can disagree with the split's rounding.
print("Training Size:- ", X_train.shape[0])
print("Validation Size:- ", X_valid.shape[0])
print("Test Size:- ", X_test.shape[0])

Training Size:-  1120000
Validation Size:-  480000
Test Size:-  498


In [64]:
from sklearn.svm import LinearSVC

# Number of leading rows of the training split used for this fit.
N_FIT_ROWS = 200000

# Token counts -> TF-IDF weights -> linear SVM.
# random_state ties the solver's internal shuffling to the notebook-wide seed
# so repeated runs give the same model.
# NOTE: the variable keeps the name `nb` because the evaluation cells call nb.predict.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LinearSVC(random_state=RANDOM_SEED)),
              ])
print(X_train.shape)
# .iloc makes the "first N rows" intent explicit: the shuffled split keeps the
# original integer labels as its index.
nb.fit(X_train.iloc[:N_FIT_ROWS], y_train.iloc[:N_FIT_ROWS])

(1120000,)


In [None]:
from sklearn.svm import LinearSVC

# Number of leading rows of the training split used for this fit.
N_FIT_ROWS = 200000

# TfidfVectorizer already performs counting followed by TF-IDF weighting, so
# the extra TfidfTransformer step that used to follow it re-weighted features
# that were already TF-IDF values. The redundant step is removed; the pipeline
# is now TF-IDF features -> linear SVM.
# random_state ties the solver's internal shuffling to the notebook-wide seed.
# NOTE: the variable keeps the name `nb` because the evaluation cells call nb.predict.
nb = Pipeline([('vect', TfidfVectorizer()),
               ('clf', LinearSVC(random_state=RANDOM_SEED)),
              ])
print(X_train.shape)
# .iloc makes the "first N rows" intent explicit: the shuffled split keeps the
# original integer labels as its index.
nb.fit(X_train.iloc[:N_FIT_ROWS], y_train.iloc[:N_FIT_ROWS])

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Token counts -> TF-IDF weights -> logistic regression, fitted on the whole
# training split. The variable keeps the name `nb` because the evaluation
# cells call nb.predict.
logistic_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(max_iter=15000)),
]
nb = Pipeline(logistic_steps)
print(X_train.shape)
nb.fit(X_train, y_train)

(1120000,)


In [36]:
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weights -> multinomial naive Bayes, fitted on the
# first 500000 rows of the training split.
naive_bayes_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(naive_bayes_steps)
print(X_train.shape)
fit_rows = slice(None, 500000)
nb.fit(X_train[fit_rows], y_train[fit_rows])

(1120000,)


In [37]:
# Accuracy (in percent) of the currently fitted `nb` pipeline on the training split.
# Accuracy is symmetric in its two arguments, so passing the true labels first
# yields the same number.
y_pred = nb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:-', 100 * train_accuracy)

Accuracy:- 80.19616071428571


In [38]:
# Accuracy (in percent) of the currently fitted `nb` pipeline on the validation split.
# Accuracy is symmetric in its two arguments, so passing the true labels first
# yields the same number.
y_pred = nb.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_pred)
print('Accuracy:-', 100 * valid_accuracy)

Accuracy:- 75.93333333333334


In [65]:
# Accuracy (in percent) of the currently fitted `nb` pipeline on the test set.
# y_pred is left holding the test predictions, which the report and
# confusion-matrix cells read.
y_pred = nb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:-', 100 * test_accuracy)

Accuracy:- 60.44176706827309


Accuracy  
59.43 for LinearSVC on 200000 | 76.49 val | count vector  
58.43 for LinearSVC on 200000 | 75.55 val | tfidf vector  
59.63 for LinearSVC on 200000 | 78.20 val | count vector + lemma    
59.03 for Logistic on all | 78.89 val | tfidf vector  
59.03 for Logistic on all | 78.80 val | count vector  

In [None]:
# Number of test tweets per polarity label.
test['target'].value_counts()

In [None]:
# The column-drop cell earlier in the notebook removes 'id' from `train`, so an
# unconditional train['id'] lookup raises KeyError on a top-to-bottom run.
# Count occurrences per id only while the column is still present; otherwise
# the cell displays nothing.
train['id'].value_counts() if 'id' in train.columns else None

In [35]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1 for the predictions currently held in
# y_pred, compared against the test labels. zero_division=0 reports 0 for a
# metric whose denominator is zero (e.g. a label that is never predicted).
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.86      0.76       177
           2       0.00      0.00      0.00       139
           4       0.54      0.82      0.65       182

    accuracy                           0.60       498
   macro avg       0.41      0.56      0.47       498
weighted avg       0.44      0.60      0.51       498



In [33]:
# Confusion matrix of test labels (first argument) against the predictions in y_pred.
confusion_matrix(y_test,y_pred)

array([[152,   0,  25],
       [ 37,   0, 102],
       [ 33,   0, 149]], dtype=int64)

In [63]:
# Pass the example through the same clean_text preprocessing that was applied
# to the training tweets, so the pipeline sees text in the form it was fitted on.
nb.predict([clean_text('lol')])

array([4], dtype=int64)

In [67]:
# NOTE(review): input() waits for keyboard entry, so this cell stalls an
# unattended "Restart & Run All". The typed text is only wrapped in a list and
# displayed — presumably a leftover from trying nb.predict([...]) interactively;
# confirm before keeping.
[input()]

['asdasd']