In [2]:
# Imports and stuff
import os
import time
import csv

#Libraries
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction import stop_words
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import NuSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold

#Powerpuff-stuff
import helpers as HL
import tolken as TO
import preprocessing as PP

### Running for local testing


In [2]:
# LOCAL RUN: small training files, evaluated with cross validation further down.

# Input files: positive examples, negative examples, and the unlabeled test data.
training_set_pos, training_set_neg = "train_pos.txt", "train_neg.txt"
test_set = "test_data.txt"

# Order matters for the labels built later: positive file first, negative second.
inputfiles = [training_set_pos, training_set_neg]

# create_corpus hands back two values: the corpus and a second one bound to
# file_lengths (presumably the per-file line counts -- confirm in helpers.py).
corpus, file_lengths = HL.create_corpus(inputfiles)

In [3]:
# Run the corpus through PP.add_features and rebind `corpus` to the result.
# The input name is overwritten, so executing this cell a second time applies
# the step to already-processed data.
# NOTE(review): what add_features changes is not visible here -- see preprocessing.py.
corpus = PP.add_features(corpus)

In [4]:
# Run the corpus through PP.stem_words and rebind `corpus` to the result.
# Like the previous step this overwrites its own input, so the cell is not
# safe to execute twice on the same kernel state.
# NOTE(review): presumably reduces each word to its stem -- confirm in preprocessing.py.
corpus = PP.stem_words(corpus)

In [8]:
# Custom stop words handed to the vectorizers below via `stop_words=`.
my_stopword_list = "and to the of in there".split()

In [None]:
# Cross-validated evaluation of a LinearSVC on n-gram term-frequency features.
# Uses `corpus` and `my_stopword_list` defined in the cells above.

# Expected size of the local training set. The label layout below requires the
# positive lines to come first in `corpus`.
# NOTE(review): presumably these equal the line counts of train_pos.txt /
# train_neg.txt (possibly available in `file_lengths`) -- confirm.
nr_pos_lines = 100000
nr_lines_total = 200000

# Fail early with a clear message if the corpus in memory is not the one these
# constants describe (e.g. the full Kaggle corpus left over from another cell).
if len(corpus) != nr_lines_total:
    raise ValueError(
        "corpus has %d lines but nr_lines_total is %d" % (len(corpus), nr_lines_total)
    )

# Making labels, setting pos = 1 and neg = 0 (np.zeros already provides the zeros).
labels = np.zeros(nr_lines_total)
labels[:nr_pos_lines] = 1

# Variables for measuring accuracy -----------------------

totalsvm = 0                     # number of correctly classified held-out lines
totalMatSvm = np.zeros((2, 2))   # confusion matrix summed over all folds

# Running algorithm with K-fold ---------------------------

# Stratified 5-fold cross validation
kf = StratifiedKFold(n_splits=5)

for train_index, test_index in kf.split(corpus, labels):

    X_train = [corpus[i] for i in train_index]
    X_test = [corpus[i] for i in test_index]

    y_train, y_test = labels[train_index], labels[test_index]

    # The vectorizer is re-created and re-fitted per fold so that the vocabulary
    # is learned from the training part only.
    vectorizer = TfidfVectorizer(
        min_df=0.0001,      # float = document proportion: drop terms in fewer than 0.01% of the training lines
        max_df=0.5,         # drop terms that appear in more than 50% of the training lines
        sublinear_tf=True,  # replace tf with 1 + log(tf)
        max_features=3000,  # keep only the 3000 most frequent terms
        use_idf=False,      # no IDF weighting: features are normalized term frequencies
        stop_words=my_stopword_list,
        ngram_range=(1, 3)  # unigrams, bigrams and trigrams
    )

    train_corpus_tf_idf = vectorizer.fit_transform(X_train)
    test_corpus_tf_idf = vectorizer.transform(X_test)

    model1 = LinearSVC()
    model1.fit(train_corpus_tf_idf, y_train)
    result1 = model1.predict(test_corpus_tf_idf)

    # Vectorized count of correct predictions (the builtin sum() walks the
    # numpy array element by element).
    fold_correct = np.count_nonzero(y_test == result1)
    print("Fold accuracy:", fold_correct / len(y_test))

    # labels=[0, 1] pins the row/column order (neg, pos) of the 2x2 matrix.
    totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1, labels=[0, 1])
    totalsvm = totalsvm + fold_correct

print(totalMatSvm, "\n", "Total accuracy:", totalsvm/nr_lines_total)

## FOR KAGGLE DELIVERY

In [3]:
########### LOADING FILES, CREATING CORPUS + LABELS ###########

# Full training files plus the unlabeled test file.
training_set_pos, training_set_neg = "train_pos_full.txt", "train_neg_full.txt"
test_set = "test_data.txt"

# Training corpus is built positive-file-first; the test corpus comes from a single file.
corpus, file_lengths = HL.create_corpus([training_set_pos, training_set_neg])
test_corpus, test_coprus_length = HL.create_corpus([test_set])

# Number of lines in the pos and neg training sets (hard-coded, not counted here).
nr_pos_lines = nr_neg_lines = 1250000

# NOTE(review): the corpus is ordered pos-then-neg while create_labels receives
# (neg, pos). With equal counts the call is unaffected, but confirm the expected
# parameter order in helpers.py.
labels = HL.create_labels(nr_neg_lines, nr_pos_lines)

In [4]:
start = time.time()

########### PREPROCESSING ###########
# Training and test corpora go through the same two steps, in the same order:
# PP.add_features first, then PP.stem_words on its result.
# NOTE(review): the saved output of this cell shows
# "TypeError: 'module' object is not callable"; the cause is not visible in this
# cell -- presumably it originates inside the preprocessing module. Verify.
corpus = PP.stem_words(PP.add_features(corpus))
test_corpus = PP.stem_words(PP.add_features(test_corpus))

# Wall-clock time of the preprocessing, in minutes.
print("Time in min:", (time.time() - start) / 60)

TypeError: 'module' object is not callable

In [6]:
############ SELECTING MODEL ###########
# Linear support-vector classifier built with its default constructor arguments.
# It is only instantiated here; fitting happens in a later cell.
model = LinearSVC()

In [9]:
########### DEFINING TF-IDF VECTORIZER ###########
# Same settings as the vectorizer used in the cross-validation cell.
# Needs `my_stopword_list` to exist in the kernel before this cell runs.
# Note that use_idf=False, so despite the class name no IDF weighting is applied.
vectorizer = TfidfVectorizer(
        min_df = 0.0001, # float = document proportion: drop terms found in fewer than 0.01% of the documents
        max_df = 0.5, # drop terms that appear in more than 50% of the documents
        sublinear_tf=True, # scale the term frequency logarithmically: 1 + log(tf)
        max_features = 3000, # keep only the 3000 most frequent terms
        use_idf = False, # disable inverse-document-frequency reweighting
        stop_words = my_stopword_list,
        ngram_range=(1,3) # unigrams, bigrams and trigrams
)

In [10]:
start = time.time()

# Fit the vectorizer on the training corpus and transform it in the same call.
train_corpus_fittrans = vectorizer.fit_transform(corpus) 

# Transform the test corpus with the already-fitted vectorizer (no refit).
test_corpus_trans = vectorizer.transform(test_corpus)

# Wall-clock time of this cell in minutes.
print ("Time in min:", (time.time() - start) / 60 )

Time in min: 9.949413025379181


In [11]:
start = time.time()

# FITTING THE MODEL
# Trains `model` on the vectorized training corpus and `labels`.
model.fit(train_corpus_fittrans, labels)

# Wall-clock time of the fit in minutes.
elapsed = time.time()
print ("Time in min:", (elapsed - start) / 60 )

Time in min: 25.166172846158346


In [12]:
start = time.time()

#PREDICTING
# Predict a label for every line of the vectorized test corpus.
submission_prediction = model.predict(test_corpus_trans)
# Quick visual check of the predicted labels (the saved output shows 0. / 1. values).
print(submission_prediction)

# Wall-clock time of the prediction in minutes.
elapsed = time.time()
print ("Time in min:", (elapsed - start) / 60 )

[ 0.  0.  0. ...,  0.  1.  0.]
Time in min: 0.0017548521359761556


In [14]:
# Replace every 0 prediction with -1 and keep all other values as they are.
# NOTE(review): presumably the submission format expects -1/1 labels -- confirm.
submission_prediction = [val if val != 0 else -1 for val in submission_prediction]

In [16]:
start = time.time()

#CREATING SUBMISSION
y_pred = submission_prediction
# One id per prediction, counting from 1. Derived from the predictions rather
# than a fixed 10000 so ids and predictions always have the same length.
ids = list(range(1, len(y_pred) + 1))
# Output file name passed to the helper.
name = "keggle_submission_idftf_full.csv"

# NOTE(review): the CSV layout is decided inside HL.create_csv_submission -- see helpers.py.
HL.create_csv_submission(ids, y_pred, name)

# Wall-clock time of writing the submission, in minutes.
elapsed = time.time()
print ("Time in min:", (elapsed - start) / 60 )

Time in min: 0.0017630338668823242
