In [1]:
%matplotlib inline
import IPython.core.display         
# setup output image format (Chrome works best)
IPython.core.display.set_matplotlib_formats("svg")
import matplotlib.pyplot as plt
import matplotlib
from numpy import *
from sklearn import *
from scipy import stats
import csv
# NOTE(review): `random` is not imported explicitly in this cell; it is whatever
# the star imports above bound -- presumably numpy.random, not the stdlib
# `random` module. Confirm before relying on this seed.
random.seed(100)

In [2]:
def read_text_data(fname):
    """Read a comma-separated file whose rows are ``text[,class[,topic]]``.

    Parameters
    ----------
    fname : str
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    tuple
        ``(txtdata, classes, topics)``, three lists.
        ``txtdata`` holds the first column of every non-blank row.
        ``classes`` holds the second column of every row that has one; the
        labels are returned exactly as read (strings), no conversion is done.
        ``topics`` holds the third column of those rows, or ``None`` when a
        labelled row has no third column, so it stays aligned with ``classes``.

    Raises
    ------
    ValueError
        If some, but not all, rows carry a class label.
    """
    txtdata = []
    classes = []
    topics = []
    # newline='' leaves newline handling to the csv module, as its docs advise
    with open(fname, 'r', encoding='UTF-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; indexing it would fail
                continue
            # get the text
            txtdata.append(row[0])
            # get the class and topic, when the row is labelled
            if len(row) > 1:
                classes.append(row[1])
                topics.append(row[2] if len(row) > 2 else None)

    # either every row is labelled or none is
    if classes and len(txtdata) != len(classes):
        raise ValueError("mismatched length!")

    return (txtdata, classes, topics)

def read_csv_data(fname):
    """Read the text stored in the second column of a CSV file.

    Parameters
    ----------
    fname : str
        Path of the UTF-8 encoded CSV file to read.

    Returns
    -------
    tuple
        ``(txtdata, classes, topics)``. ``txtdata`` holds the second column
        of every non-blank row (no row is treated as a header, so a header
        row, if present, is included). ``classes`` and ``topics`` are always
        empty lists; they are returned so the result unpacks the same way as
        ``read_text_data``.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than two columns.
    """
    txtdata = []
    classes = []
    topics = []
    # newline='' leaves newline handling to the csv module, as its docs advise
    with open(fname, 'r', encoding='UTF-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; indexing it would fail
                continue
            # get the text
            txtdata.append(row[1])

    return (txtdata, classes, topics)

def write_csv_kaggle_sub(fname, Y):
    """Write predictions to ``fname`` as a two-column CSV submission file.

    The first row is the header ``Id,Prediction``. Each entry of ``Y`` then
    gets its own row, paired with a 1-based Id in iteration order.

    fname -- output file name
    Y     -- list/array with class entries
    """
    header = ['Id', 'Prediction']

    # number the predictions 1, 2, 3, ... before the file is opened
    rows = [[idx, label] for idx, label in enumerate(Y, start=1)]

    # newline='' so the csv writer controls the line endings itself
    with open(fname, 'w', newline='') as out:
        csv.writer(out).writerows([header] + rows)

In [4]:
# load the data
# Training file: keep the texts and their class labels; the third returned
# list (topics) is discarded.
(traintxt, trainY, _) = read_text_data("sanders_tweets_train.txt")
# Data to predict on: only the texts are kept (read_csv_data returns empty
# lists for the other two slots).
(testtxt, _, _)       = read_csv_data("result_olympic.csv")

# report how many texts were loaded from each file
print(len(traintxt))
print(len(testtxt))

2396
312792


In [7]:
# test: quick sanity check, show the second entry (index 1) of the loaded test texts
print(testtxt[1])

['rejected', 'include', 'bowling', 'chess', 'tug', 'war']


In [8]:
# Bag-of-Words representation

# English stop words are dropped and the vocabulary is capped at 4500 terms
cntvect = feature_extraction.text.CountVectorizer(stop_words='english', max_features=4500)

# create the vocabulary and calculate the vectors for the training data in one
# pass (equivalent to fit() followed by transform(), but the training corpus
# is only tokenized once)
trainXbow = cntvect.fit_transform(traintxt)

# calculate vectors for the test data using the training vocabulary
testXbow = cntvect.transform(testtxt)

# summarize the vocabulary rather than dumping every entry into the cell output
print(len(cntvect.vocabulary_), "terms, e.g.", sorted(cntvect.vocabulary_)[:20])



In [9]:
# SVM with RBF kernel: candidate hyperparameter values, 20 log-spaced points
# each (C from 1e-2 to 1e3, gamma from 1e-4 to 1e3)
paramgrid = {
    'C': logspace(-2, 3, 20),
    'gamma': logspace(-4, 3, 20),
}
print(paramgrid)

# 5-fold cross-validated grid search over paramgrid, using 4 parallel jobs
svmrbf = model_selection.GridSearchCV(
    svm.SVC(kernel='rbf'),
    paramgrid,
    cv=5,
    n_jobs=4,
    verbose=True,
)

{'C': array([1.00000000e-02, 1.83298071e-02, 3.35981829e-02, 6.15848211e-02,
       1.12883789e-01, 2.06913808e-01, 3.79269019e-01, 6.95192796e-01,
       1.27427499e+00, 2.33572147e+00, 4.28133240e+00, 7.84759970e+00,
       1.43844989e+01, 2.63665090e+01, 4.83293024e+01, 8.85866790e+01,
       1.62377674e+02, 2.97635144e+02, 5.45559478e+02, 1.00000000e+03]), 'gamma': array([1.00000000e-04, 2.33572147e-04, 5.45559478e-04, 1.27427499e-03,
       2.97635144e-03, 6.95192796e-03, 1.62377674e-02, 3.79269019e-02,
       8.85866790e-02, 2.06913808e-01, 4.83293024e-01, 1.12883789e+00,
       2.63665090e+00, 6.15848211e+00, 1.43844989e+01, 3.35981829e+01,
       7.84759970e+01, 1.83298071e+02, 4.28133240e+02, 1.00000000e+03])}


In [10]:
# run the grid search on the training bag-of-words vectors and their labels
svmrbf.fit(trainXbow, trainY);
# show the parameter combination the search selected
print("best params:", svmrbf.best_params_)

# predict the testset: SVM with RBF kernel
# (best_estimator_ is the estimator GridSearchCV kept for the selected parameters)
predY = svmrbf.best_estimator_.predict(testXbow)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   28.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:  5.2min finished


best params: {'C': 4.281332398719392, 'gamma': 0.08858667904100823}


In [None]:
# save the test-set predictions as an Id,Prediction CSV file
write_csv_kaggle_sub("result_olympic_predict.csv", predY)