In [1]:
import numpy as np
import pandas as pd
import sklearn
from pyspark.ml import Pipeline

train = pd.read_csv("training.csv", encoding = 'ISO 8859-1', header = None)
test = pd.read_csv("testdata.csv", header = None)

train.head()
test.head()

Unnamed: 0,0,1,2,3,4,5
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [2]:
train = train.append(test, ignore_index = True)
train.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
train = train.drop(train.columns[[1,2,3,4]], axis = 1)
train.columns = ['label', 'tweet']
train.head()

Unnamed: 0,label,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
pd.isnull(train).sum()

label    0
tweet    0
dtype: int64

In [5]:
train_head = train.iloc[0:50001,0:4]
train_tail = train.iloc[-50000:-1,0:4]
train_comb = pd.concat([train_head,train_tail])
train_comb = train_comb.reset_index(drop=True)

In [6]:
train_comb.shape

(100000, 2)

In [7]:
pd.isnull(train_comb).sum()

label    0
tweet    0
dtype: int64

In [8]:
train_comb.tail(3)

Unnamed: 0,label,tweet
99997,0,"On that note, I hate Word. I hate Pages. I hat..."
99998,4,Ahhh... back in a *real* text editing environm...
99999,0,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [9]:
print(sum(train_comb["label"] == 0))
print(sum(train_comb["label"] == 4))
print(sum(train_comb["label"] == 2))

50177
49684
139


In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer

spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()

In [11]:
import re


REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")

In [12]:
import preprocessor as p

def clean_tweets(df):
  tempArr = []
  for line in df:
    # send to tweet_processor
    tmpL = p.clean(line)
    # remove puctuation
    tmpL = REPLACE_NO_SPACE.sub("", tmpL.lower()) # convert all tweets to lower cases
    tmpL = REPLACE_WITH_SPACE.sub(" ", tmpL)
    tempArr.append(tmpL)
  return tempArr

train_tweet = clean_tweets(train_comb["tweet"])
train_tweet = pd.DataFrame(train_tweet)

In [13]:
train_comb["clean_tweet"] = train_tweet

In [14]:
train_comb.head()

Unnamed: 0,label,tweet,clean_tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats a bummer you shoulda got david ca...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...


In [15]:
train_comb.tail()

Unnamed: 0,label,tweet,clean_tweet
99995,4,"After using LaTeX a lot, any other typeset mat...",after using latex a lot any other typeset math...
99996,2,Ask Programming: LaTeX or InDesign?: submitted...,ask programming latex or indesign submitted by...
99997,0,"On that note, I hate Word. I hate Pages. I hat...",on that note i hate word i hate pages i hate l...
99998,4,Ahhh... back in a *real* text editing environm...,ahhh back in a *real* text editing environment...
99999,0,"Trouble in Iran, I see. Hmm. Iran. Iran so far...",trouble in iran i see hmm iran iran so far away


In [16]:
from sklearn.model_selection import train_test_split

y = train_comb.label.values
x_train, x_test, y_train, y_test = train_test_split(train_comb.clean_tweet.values, y, 
                                                    stratify=y, 
                                                    random_state=101, 
                                                    test_size=0.3, shuffle=True)

In [17]:
x_test

array(['is hearing family drama coming from the other room',
       'has had a rethink about todays plans after checking her account  looks like its just housework today then',
       'leg is crippled like really cant walk well so sad lovee the new tattoo though',
       ..., 'cant im already going out',
       'omg that made me thinking yeah youre right never mistake attention with affection',
       'man im so sad right now can i get a few more hours of live mixing'],
      dtype=object)

In [18]:
x_train

array(['omg why half  i fort i was bad when i got up at lmfao',
       'depressed that on such a sunny day the only bit of it i will see is the two minute walk from victoria underground',
       'sports betting are a wise investment and with the right tips you multiply your money constantly wwwwinplaceat',
       ...,
       'im not able to attend today as i have a tele video conf scheduled tonight',
       'im just tired', 'writing my report for uni'], dtype=object)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

vectorizer = CountVectorizer(binary=True, stop_words='english')
vectorizer.fit(list(x_train) + list(x_test))

x_train_vec = vectorizer.transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [None]:
from sklearn import svm

svm = svm.SVC(kernel = 'linear', probability=True)
prob = svm.fit(x_train_vec, y_train).predict_proba(x_test_vec)
y_pred_svm = svm.predict(x_test_vec)

In [None]:
from sklearn.metrics import accuracy_score
print("Accuracy score for SVC is: ", accuracy_score(y_test, y_pred_svm) * 100, '%')

In [None]:
x_test_vec