/
TfIdf.final.py
56 lines (46 loc) · 2.39 KB
/
TfIdf.final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import pandas as pd
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from tools import *
def _get_logger():
    """Return the 'my_logger' logger with a DEBUG-level console handler.

    The handler is attached only when the logger has no handlers yet, so
    calling main() more than once in the same process does not emit every
    record several times.
    """
    logger = logging.getLogger('my_logger')
    logger.setLevel(logging.DEBUG)
    if not logger.handlers:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(console)
    return logger


def main():
    """Train on labelled tweets and score the experiment tweets with TF-IDF features.

    Steps:
      1. Read ``data/in/en_sentiment.tsv`` and drop rows whose ``sentiment``
         is ``'neutral'`` or ``'na'``; label the rest 1 for ``'positive'`` and
         -1 for anything else.
      2. Clean every training tweet with ``tools.to_words`` and fit a
         ``TfidfVectorizer`` on the result.
      3. Read and concatenate the two experiment CSV files, clean their
         ``text`` column the same way, and transform it with the already
         fitted vectorizer.
      4. Call ``tools.ml_classify`` once per classifier name, passing an
         output path of the form ``data/out/tfidf_<method>_score.csv``.

    Takes no arguments and returns nothing.
    """
    logger = _get_logger()
    t = tools()

    logger.info('Read train data...')
    train_o = pd.read_csv('data/in/en_sentiment.tsv', header=0, delimiter='\t', quoting=3)
    keep = (train_o['sentiment'] != 'neutral') & (train_o['sentiment'] != 'na')
    # Take an explicit copy: adding a column to a .loc selection of train_o is
    # the chained-assignment pattern pandas warns about (SettingWithCopyWarning).
    train = train_o.loc[keep, ['sentiment', 'tweet']].copy()
    # Vectorised labelling: 1 for 'positive', -1 for every other remaining
    # value. The filtered index is kept as-is, exactly as before.
    train['sent_num'] = train['sentiment'].eq('positive').map({True: 1, False: -1})

    logger.info('Cleaning and parsing the training set...')
    # The three positional True flags are forwarded unchanged; their meaning is
    # defined by tools.to_words (not visible from this file).
    clean_train_tweets = [
        t.to_words(tweet, True, True, True) for tweet in train['tweet']
    ]

    logger.info('Creating the tf-idf of words from training set...')
    vectorizer_tfidf = TfidfVectorizer(min_df=1)
    # Densify the sparse matrix, as the original did, before handing it to the
    # classifiers.
    train_data_features = vectorizer_tfidf.fit_transform(clean_train_tweets).toarray()
    logger.debug(train_data_features.shape)

    logger.info('Read the test data...')
    test_1 = pd.read_csv('data/in/experiment-1_twitter.csv', header=0, delimiter=',', quoting=0)
    test_2 = pd.read_csv('data/in/experiment-2_twitter.csv', header=0, delimiter=',', quoting=0)
    test = pd.concat([test_1, test_2])

    logger.info('Cleaning and parsing the test set ...')
    clean_test_tweets = [
        t.to_words(text, True, True, True) for text in test['text']
    ]

    logger.info('Creating the tf-idf of words from test set...')
    # transform (not fit_transform): reuse the vocabulary and idf weights
    # learned from the training set.
    test_data_features = vectorizer_tfidf.transform(clean_test_tweets).toarray()
    logger.debug(test_data_features.shape)

    ml_methods = [
        'knn',
        'decision_tree',
        'random_forest',
        'logistic_regression',
        'naive_bayes',
        'svm',
        'gradient_boosting',
    ]
    for ml_method in ml_methods:
        # Lazy logger arguments: the message is only formatted if it is emitted.
        logger.info('Prediction with %s...', ml_method)
        file_name = 'data/out/tfidf_%s_score.csv' % (ml_method)
        # The trailing True is forwarded unchanged; its meaning is defined by
        # tools.ml_classify (not visible from this file).
        t.ml_classify(
            ml_method,
            train_data_features,
            train['sent_num'],
            test_data_features,
            file_name,
            True,
        )
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()