In [56]:
# Imports and stuff
import os
import time
import csv

#Libraries
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold

#Powerpuff-stuff
import helpers


### Running for local testing

In [58]:
# RUNNING ON TRAINING SET WITH CROSS VALIDATION
#
# Reads the positive and negative training files, labels every line
# (pos = 1, neg = 0) and evaluates a TF-IDF + LinearSVC pipeline with
# stratified 3-fold cross validation. A summed confusion matrix and the
# overall accuracy are printed at the end.

training_set_pos = "train_pos.txt"
training_set_neg = "train_neg.txt"
training_set_full = "train_full.txt"
test_set = "test_data.txt"

# Creating corpus ---------------------------------------
# The files are opened in binary mode, so each corpus entry is a bytes
# object; decoding is left to TfidfVectorizer further down.
corpus = []

# Positive examples first ...
with open(training_set_pos, 'rb') as infile:
    corpus.extend(infile)
nr_pos_lines = len(corpus)

# ... followed by the negative examples.
with open(training_set_neg, 'rb') as infile:
    corpus.extend(infile)
nr_neg_lines = len(corpus) - nr_pos_lines

# Total number of lines
nr_lines_total = nr_pos_lines + nr_neg_lines

# Creating labels-array ---------------------------------
# pos = 1, neg = 0. The array starts out as all zeros, so only the leading
# positive slice has to be set. (The previous version indexed with the
# undefined name `nr_lines_pos`, which fails on a fresh kernel.)
labels = np.zeros(nr_lines_total)
labels[:nr_pos_lines] = 1

# Variables for measuring accuracy -----------------------
totalsvm = 0                    # number of correctly classified lines, summed over folds
totalMatSvm = np.zeros((2, 2))  # confusion matrix, summed over folds

# Running algorithm with K-fold ---------------------------

# Stratified K-fold with K = 3 for cross validation
kf = StratifiedKFold(n_splits=3)

for train_index, test_index in kf.split(corpus, labels):

    X_train = [corpus[i] for i in train_index]
    X_test = [corpus[i] for i in test_index]

    y_train, y_test = labels[train_index], labels[test_index]

    # A new vectorizer per fold: the vocabulary and idf weights are learned
    # from this fold's training part only and then applied to its test part.
    vectorizer = TfidfVectorizer(
        min_df=5,             # drop terms found in fewer than 5 lines
        max_df=0.8,           # drop terms found in more than 80% of the lines
        sublinear_tf=True,    # scale the term frequency logarithmically
        use_idf=True,
        stop_words='english'  # remove English stop-words
    )

    train_corpus_tf_idf = vectorizer.fit_transform(X_train)
    test_corpus_tf_idf = vectorizer.transform(X_test)

    model1 = LinearSVC()
    model1.fit(train_corpus_tf_idf, y_train)
    result1 = model1.predict(test_corpus_tf_idf)

    print(result1)

    totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1)
    totalsvm = totalsvm + np.sum(y_test == result1)


print(totalMatSvm, "\n", "Total accuracy:", totalsvm/nr_lines_total)


[ 0.  0.  1. ...,  0.  0.  0.]
[ 1.  0.  1. ...,  0.  0.  0.]
[ 0.  1.  0. ...,  0.  0.  0.]
[[ 74396.  25604.]
 [ 19708.  80292.]] 
 Total accuracy: 0.77344


### Running for Kaggle delivery

In [46]:
start = time.time()

# DATA READING AND PREPROCESSING ------------------------------------------
# Loads the full training files and the test file, builds the label vector,
# and creates (but does not yet fit) the vectorizer and the classifier.

training_set_pos = "train_pos_full.txt"
training_set_neg = "train_neg_full.txt"

test_set = "test_data.txt"

# Line counters for the positive and negative training files
nr_pos_lines = nr_neg_lines = 0


#### CREATING CORPUS ####
# Files are opened in binary mode, so every entry is a bytes object.
corpus = []

# Positive training lines come first
with open(training_set_pos, 'rb') as infile:
    corpus.extend(infile)

nr_pos_lines = len(corpus)
print("pos_lines:", nr_pos_lines)

# Negative training lines are appended after them
with open(training_set_neg, 'rb') as infile:
    corpus.extend(infile)

nr_neg_lines = len(corpus) - nr_pos_lines
print("neg_lines:", nr_neg_lines)

nr_lines_total = nr_pos_lines + nr_neg_lines
print("total_lines:", nr_lines_total)

##########################


#### CREATING TEST CORPUS ####

with open(test_set, 'rb') as infile:
    test_corpus = list(infile)


##############################

# Labels: pos = 1 and neg = -1. Start with every entry at -1 and overwrite
# the leading block that corresponds to the positive lines.
labels = np.full(nr_lines_total, -1.0)
labels[:nr_pos_lines] = 1

vectorizer = TfidfVectorizer(
    min_df=5,              # drop terms found in fewer than 5 lines
    max_df=0.8,            # drop terms found in more than 80% of the lines
    sublinear_tf=True,     # scale the term frequency logarithmically
    use_idf=True,
    stop_words='english',  # remove English stop-words
)


# CLASSIFYING WITH SVM (the model is only created here; fitting happens later)
model = LinearSVC()

elapsed = time.time()

print("Time in min:", (elapsed - start) / 60)


pos_lines: 1250000
neg_lines: 1250000
total_lines: 2500000
Time in min: 0.01910977760950724


In [47]:
start = time.time()

# VECTORIZING
# Learn vocabulary and idf weights from the training corpus, then map the
# test corpus onto that same fitted vocabulary.
train_corpus_fittrans = vectorizer.fit_transform(corpus)
test_corpus_trans = vectorizer.transform(test_corpus)

elapsed = time.time()

duration_min = (elapsed - start) / 60
print("Time in min:", duration_min)

Time in min: 0.7312580029169718


In [48]:
start = time.time()

# FITTING THE MODEL
# Train the classifier on the TF-IDF matrix of the whole training corpus.
model.fit(train_corpus_fittrans, labels)

elapsed = time.time()
duration_min = (elapsed - start) / 60
print("Time in min:", duration_min)

Time in min: 10.896052078406017


In [49]:
start = time.time()

# PREDICTING
# Classify every line of the vectorized test corpus with the fitted model.
submission_prediction = model.predict(test_corpus_trans)

elapsed = time.time()
duration_min = (elapsed - start) / 60
print("Time in min:", duration_min)

Time in min: 3.343025843302409e-05


In [57]:
start = time.time()

# CREATING SUBMISSION
# Writes the test-set predictions to a CSV file, with ids counted from 1.
y_pred = submission_prediction

# One id per prediction. The ids were previously generated from
# nr_lines_total, which is the number of *training* lines and does not
# match the number of test predictions.
ids = list(range(1, len(y_pred) + 1))
name = "keggle_submission.csv"

# NOTE(review): only `import helpers` is visible in the import cell, so this
# unqualified name is presumably expected to come from that module
# (e.g. helpers.create_csv_submission) - confirm and import it explicitly,
# otherwise this cell fails on a fresh kernel.
create_csv_submission(ids, y_pred, name)

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

Time in min: 0.002280151844024658
