In [1]:
%matplotlib inline

import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import nltk package
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support


In [2]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/cbvincen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cbvincen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
directory = os.getcwd()

In [4]:
tweet_dir = os.path.join(directory,'tweets')

In [5]:
tweets = pd.read_csv(tweet_dir, encoding='latin-1')
tweets.columns = ['Polarity', 'ID', 'Date', 'Flag', 'User', 'Text']

In [6]:
tweets.drop(['Flag', 'ID'], axis=1, inplace=True)
tweets.head()

Unnamed: 0,Polarity,Date,User,Text
0,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
1,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
3,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,@Kwesidei not the whole crew


In [7]:
tweets1 = tweets[0:5000] 
tweets2 = tweets[905000:910000]

# tweets = tweets1.add(tweets2, fill_value=0)

tweets = [tweets1, tweets2]
  
tweets = pd.concat(tweets)

tweets.head()
#tweets.shape

Unnamed: 0,Polarity,Date,User,Text
0,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
1,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
3,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,@Kwesidei not the whole crew


In [8]:
# re encode positive tweets to 1 instead of 4
def encode(label):
    if label == 0:
        return 0
    elif label == 4:
        return 1

In [9]:
tweets['Polarity'] = tweets.apply(lambda x: encode(x['Polarity']), axis=1)

In [10]:
tweets.head()

Unnamed: 0,Polarity,Date,User,Text
0,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
1,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
3,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,@Kwesidei not the whole crew


In [11]:
vectorizer = CountVectorizer(analyzer='word', max_features=2000, 
                             tokenizer=word_tokenize, stop_words= stopwords.words('english'))

In [12]:
df_Y = np.array(tweets['Polarity'])
df_Y.shape

(10000,)

In [13]:
df_X = vectorizer.fit_transform(tweets["Text"]).toarray()

In [14]:
df_train_X, df_test_X, df_train_Y, df_test_Y = train_test_split(df_X, df_Y, test_size= .2, random_state= 200)

In [15]:
def train_SVM(X, y, kernel='linear'):
    clf = SVC(kernel=kernel)
    clf.fit(X, y)
    return clf

In [16]:
df_clf = train_SVM(df_train_X, df_train_Y)

In [17]:
df_predicted_train_Y = df_clf.predict(df_train_X)
df_predicted_test_Y = df_clf.predict(df_test_X)

In [18]:
print(classification_report(df_train_Y,df_predicted_train_Y))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3997
           1       0.83      0.88      0.85      4003

    accuracy                           0.85      8000
   macro avg       0.85      0.85      0.85      8000
weighted avg       0.85      0.85      0.85      8000



# TF-IDF

In [19]:
tfidf = TfidfVectorizer(sublinear_tf=True, analyzer='word', max_features=2000,tokenizer=word_tokenize)

In [20]:
df_tfidf_X = tfidf.fit_transform(tweets['Text']).toarray()

In [22]:
df_train_tfidf_X, df_test_tfidf_X, df_train_tfidf_Y, df_test_tfidf_Y = train_test_split(df_tfidf_X, df_Y, test_size= .2, random_state= 200)

In [23]:
df_tfidf_clf = train_SVM (df_train_tfidf_X, df_train_tfidf_Y)

In [24]:
df_pred_train_tfidf_Y = df_tfidf_clf.predict(df_train_tfidf_X)
df_pred_test_tfidf_Y = df_tfidf_clf.predict(df_test_tfidf_X)

In [25]:
print(classification_report(df_train_tfidf_Y, df_pred_train_tfidf_Y))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      3997
           1       0.84      0.85      0.84      4003

    accuracy                           0.84      8000
   macro avg       0.84      0.84      0.84      8000
weighted avg       0.84      0.84      0.84      8000

