# -*- coding: UTF-8 -*-
import re

import joblib
import pandas as pd
from nltk.corpus import stopwords, movie_reviews
from sklearn import svm, naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


def preprocess(checkpoint=True):
    """
    Reads, formats, and concatenates the training data frames into one.
    :param checkpoint: True to save the combined data frame: bool
    :return: combined training set: pd.DataFrame
    """
    # getting nltk movie reviews dataset:
    documents = [(movie_reviews.raw(fileid), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    # data framing (built in one pass; row-by-row DataFrame.append is deprecated)
    nltk_df = pd.DataFrame(documents, columns=['text', 'category'])
    nltk_df['category'] = nltk_df['category'].map(lambda x: 0 if x == 'neg' else 1)
    # getting tweets dataset from Stanford:
    tweets_df = pd.read_csv('/Mining_The_Social_Web/datasets/tweetsstanford_training.csv',
                            sep=',', header=None,
                            names=['category', 'id', 'date', 'query', 'user', 'text'])
    # Stanford labels positives as 4; remap to binary
    tweets_df['category'] = tweets_df['category'].map(lambda x: 1 if x == 4 else 0)
    # getting dataset from University of Michigan:
    umich_df = pd.read_csv('/Mining_The_Social_Web/datasets/umich_training.txt',
                           sep='\t', header=None, names=['category', 'text'])
    # getting reviews dataset from Amazon:
    amazon_df = pd.read_csv('/Mining_The_Social_Web/datasets/amazon_cells_labelled.txt',
                            sep='\t', header=None, names=['text', 'category'])
    # getting reviews dataset from IMDB:
    imdb_df = pd.read_csv('/Mining_The_Social_Web/datasets/imdb_labelled.txt',
                          sep='\t', header=None, names=['text', 'category'])
    # getting reviews dataset from Yelp:
    yelp_df = pd.read_csv('/Mining_The_Social_Web/datasets/yelp_labelled.txt',
                          sep='\t', header=None, names=['text', 'category'])
    # concatenate ALL:
    trainset_df = pd.concat([nltk_df, tweets_df[['category', 'text']],
                             umich_df, yelp_df, imdb_df, amazon_df])
    trainset_df.reset_index(drop=True, inplace=True)
    if checkpoint:
        trainset_df.to_csv(path_or_buf='/Mining_The_Social_Web/datasets/alltrainset.csv',
                           header=['category', 'text'], columns=['category', 'text'],
                           index=None, sep='\t', mode='w')
    return trainset_df


def replace_text(text, replace_list, replace_by):
    """
    Replaces every occurrence of the items in replace_list by replace_by.
    :param text: str
    :param replace_list: list
    :param replace_by: str (may reference the matched item via '{0}')
    :return: new text: str
    """
    if replace_list:
        for item in set(replace_list):
            text = text.replace(item, replace_by.format(item))
    return text
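

# Illustration only (made-up strings): because replace_by is passed through
# str.format, '{0}' re-inserts the matched item, so the same helper handles
# both plain deletion and wrapping:
#   replace_text("a~b~c", ["~"], "")             -> "abc"
#   replace_text("bad movie", ["bad"], "<{0}>")  -> "<bad> movie"
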
def clean_text(tset, to_unicode=True):
    """
    Removes undesirable characters, collapses repeated whitespace, and
    decodes byte strings to unicode if needed.
    :param tset: str
    :param to_unicode: bool
    :return: clean text: str
    """
    # decode byte strings first so the regex below operates on text
    if to_unicode and isinstance(tset, bytes):
        tset = tset.decode('utf8', 'ignore')
    # undesirable chars out!
    to_del = re.findall(r"[^\w\d\s+-.,!@#$%^&*();\\\/|<>:\"\']", tset, re.IGNORECASE)
    tset = replace_text(text=tset, replace_list=to_del, replace_by="")
    tset = re.sub(r"\s{2,}", " ", tset)
    return tset
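

# Illustration only (made-up input): characters outside the allowed set are
# stripped and runs of whitespace collapse to a single space:
#   clean_text("Great movie!!   ~~loved it~~") -> "Great movie!! loved it"
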
def svm_classifier(X, y, sw=False, checkpoint=True):
    """
    Fits a TF-IDF + linear SVM pipeline on the given texts and labels.
    :param X: iterable of raw documents
    :param y: iterable of labels
    :param sw: True to filter English stopwords: bool
    :param checkpoint: True to save the fitted pipeline: bool
    :return: fitted Pipeline
    """
    # stopwords
    stop_words = list(stopwords.words('english')) if sw else None
    # initialize model to vectorize
    vec = TfidfVectorizer(lowercase=True, use_idf=True, norm='l2', smooth_idf=False,
                          analyzer='word', input='content', stop_words=stop_words,
                          min_df=10, max_features=20000, ngram_range=(1, 2),
                          sublinear_tf=True)
    # initialize svm model
    svm_clf = svm.LinearSVC(C=0.1)
    # pipeline: vectorize, then classify
    vec_svm = Pipeline([('vectorize', vec), ('svm', svm_clf)])
    # train with all data
    vec_svm.fit(X, y)
    # save model
    if checkpoint:
        filename = '/Mining_The_Social_Web/models/svmtfidf.sav'
        joblib.dump(vec_svm, filename)
    return vec_svm


def nb_classifier(X, y, sw=False, checkpoint=True):
    """
    Fits a TF-IDF + multinomial Naive Bayes pipeline on the given texts and labels.
    :param X: iterable of raw documents
    :param y: iterable of labels
    :param sw: True to filter English stopwords: bool
    :param checkpoint: True to save the fitted pipeline: bool
    :return: fitted Pipeline
    """
    # stopwords
    stop_words = list(stopwords.words('english')) if sw else None
    # initialize model to vectorize (norm=None keeps unnormalized tf-idf weights)
    vec = TfidfVectorizer(lowercase=True, use_idf=True, norm=None, smooth_idf=False,
                          analyzer='word', input='content', stop_words=stop_words,
                          min_df=10, max_features=20000)
    # initialize naive bayes model
    mnb_clf = naive_bayes.MultinomialNB()
    # pipeline: vectorize, then classify
    vec_nb = Pipeline([('vectorize', vec), ('mnb', mnb_clf)])
    # fit model
    vec_nb.fit(X, y)
    # save model
    if checkpoint:
        filename = '/Mining_The_Social_Web/models/nbtfidf.sav'
        joblib.dump(vec_nb, filename)
    return vec_nb
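

# Illustrative helper, not part of the original script: it sketches how a
# checkpointed pipeline can be reloaded and applied to new text. The default
# path is assumed to match the checkpoint written by svm_classifier above.
def load_and_predict(texts, model_path='/Mining_The_Social_Web/models/svmtfidf.sav'):
    """
    Loads a saved pipeline and predicts labels for raw texts.
    :param texts: list of str
    :param model_path: str, path to a joblib-dumped Pipeline
    :return: array of predicted labels (0 = negative, 1 = positive)
    """
    clf = joblib.load(model_path)
    # apply the same cleaning used at training time before predicting
    return clf.predict([clean_text(tset=t) for t in texts])
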
def main():
    # load training dataset (run preprocess() once beforehand to create it)
    trainset_df = pd.read_csv('/Mining_The_Social_Web/datasets/alltrainset.csv',
                              sep='\t', header=0, names=['category', 'text'])
    # preprocess text
    trainset_df['text'] = trainset_df['text'].map(lambda x: clean_text(tset=x) if x else x)
    # features and labels
    X = trainset_df['text'].values
    y = trainset_df['category'].values
    # data partition
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=6)
    # fit model (swap in nb_classifier for a Naive Bayes baseline)
    svmclf = svm_classifier(X=x_train, y=y_train, sw=False, checkpoint=True)
    # mean accuracy on the held-out set
    result = svmclf.score(x_test, y_test)
    # predictions for the confusion matrix
    y_pred = svmclf.predict(x_test)
    # confusion matrix
    confusion_m = confusion_matrix(y_test, y_pred)
    # show results:
    print("accuracy: " + str(result))
    print("Confusion Matrix: \n" + str(confusion_m))
    print(classification_report(y_test, y_pred))


if __name__ == '__main__':
    main()