In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import os

In [2]:
train_file_path = './train_data_after_pre.csv'
test_file_path = './test_data_after_pre.csv'

In [3]:
def load_csv(file_path):
    data = pd.read_csv(file_path)
    data.head()
    X = data.content
    Y = data.target
    return X, Y

X, Y = load_csv(train_file_path)

In [4]:
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [5]:
# label = {'positive':1, 'negative':-1}

# def preprocess_y(sentiment):
#     return label[sentiment]

# y = y.apply(preprocess_y)
# y.head()

In [6]:
X.head()

0    atlantis lost empire better movie thought neve...
1    wonderful film version bestselling book smash ...
2    sent prison le 10 40 year busted drug refusing...
3    bar none hilarious movie ever seen beginning f...
4    rather good movie americanised predictability ...
Name: content, dtype: object

In [7]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wanghengchao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import re
def preprocess(review):
    #convert the tweet to lower case
    review.lower()
    #convert all urls to sting "URL"
    review = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',review)
    #convert all @username to "AT_USER"
    review = re.sub('@[^\s]+','AT_USER', review)
    #correct all multiple white spaces to a single white space
    review = re.sub('[\s]+', ' ', review)
    #convert "#topic" to just "topic"
    review = re.sub(r'#([^\s]+)', r'\1', review)
    tokens = word_tokenize(review)
    tokens = [w for w in tokens if not w in stop_words]
    return " ".join(tokens)

X = X.apply(preprocess)
X.head()

0    atlantis lost empire better movie thought neve...
1    wonderful film version bestselling book smash ...
2    sent prison le 10 40 year busted drug refusing...
3    bar none hilarious movie ever seen beginning f...
4    rather good movie americanised predictability ...
Name: content, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
def feature_extraction(data):
    tfv=TfidfVectorizer(sublinear_tf=True, stop_words = "english")
    features=tfv.fit_transform(data)
    pickle.dump(tfv.vocabulary_, open("svm_feature.pkl", "wb"))
    return features

data = np.array(X)
label = np.array(Y)
features = feature_extraction(data)

print(features)

  (0, 2641)	0.16341453880947646
  (0, 95261)	0.12246356610044269
  (0, 4958)	0.10562382004472992
  (0, 41846)	0.08757288118911812
  (0, 85637)	0.10678315419247991
  (0, 49945)	0.08526602790264595
  (0, 13701)	0.06764369661229365
  (0, 7334)	0.12264702703006267
  (0, 68495)	0.13570427598810866
  (0, 109632)	0.093199350861385
  (0, 78982)	0.08708889002879584
  (0, 115122)	0.10683262260610758
  (0, 19873)	0.1272228587845625
  (0, 65410)	0.06539732586794086
  (0, 114025)	0.1316637924488914
  (0, 50832)	0.060988817011347535
  (0, 20098)	0.07806878688785383
  (0, 126351)	0.18561920291698983
  (0, 121339)	0.13854515501675002
  (0, 13811)	0.17613929219192448
  (0, 46673)	0.1414523233478071
  (0, 37939)	0.2225921565596288
  (0, 127632)	0.060760143947124846
  (0, 4464)	0.12325255770879875
  (0, 6544)	0.1056002189701825
  :	:
  (24999, 76243)	0.14823796441277381
  (24999, 31605)	0.12539527184487945
  (24999, 109153)	0.10248865545395464
  (24999, 41585)	0.13358966417236628
  (24999, 20763)	0.10486

In [10]:
from sklearn.model_selection import train_test_split  
X_train, X_test, Y_train, Y_test = train_test_split(features, label, test_size = 0.20) 
X_train

<20000x132911 sparse matrix of type '<class 'numpy.float64'>'
	with 1749833 stored elements in Compressed Sparse Row format>

In [20]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='linear')  

svclassifier.fit(features, label)  

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
from sklearn.metrics import accuracy_score
val_pred = svclassifier.predict(X_test)
print(val_pred)
print(accuracy_score(Y_test, val_pred))

[1 1 1 ... 0 0 1]
0.86104


In [22]:
filename = 'svm_model.sav'
pickle.dump(svclassifier, open(filename, 'wb'))

Saving the model

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
X_test1,Y_test1 = load_csv(test_file_path)
data_test = np.array(X_test1)
label_test = np.array(Y_test1)
transformer = TfidfTransformer()
tfv_loaded = TfidfVectorizer(sublinear_tf=True, stop_words = "english", vocabulary=pickle.load(open("svm_feature.pkl", "rb")))
features_test = transformer.fit_transform(tfv_loaded.fit_transform(data_test))
X_test1,Y_test1 = features_test, label_test

In [26]:
loaded_model = pickle.load(open(filename, 'rb'))
val_pred_test = loaded_model.predict(X_test1)
print(accuracy_score(Y_test1, val_pred_test))

0.86104
