In [1]:
# Imports and stuff
import csv
import os
import time

#Libraries
import numpy as np
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

#Powerpuff-stuff
import cleaning as CL
import gridsearch as GS
import helpers as HL
import tolken as TO

# Defining paths for data: 

In [2]:
# RUNNING ON TRAINING SET WITH CROSS VALIDATION
# File names of the data sets, relative to the notebook's working directory.
training_set_pos, training_set_neg = "train_pos.txt", "train_neg.txt"
training_set_full = "train_full.txt"
test_set = "test_data.txt"

# Creating corpus with training data:

In [3]:
# Build one corpus from the positive and the negative training file (in that
# order). file_lengths is indexed per input file further down.
inputfiles = [
    training_set_pos,
    training_set_neg,
]
original_corpus, file_lengths = HL.create_corpus(inputfiles)

In [4]:
# Sanity check: show the first entry of the raw corpus.
print(original_corpus[0])

b'<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15\n'


In [5]:
# Sanity check: number of entries in the raw corpus.
print(len(original_corpus))

200000


# Creating corresponding labels: 

In [6]:
# file_lengths follows the order of inputfiles: positive file first, negative second.
nr_pos_lines, nr_neg_lines = file_lengths[0], file_lengths[1]
nr_lines_total = sum(file_lengths)

# Label vector built from the two class sizes.
labels = HL.create_labels(nr_pos_lines, nr_neg_lines)

# Applying token replacement (`tolken` module)

In [7]:
# Run the project's word-replacement step (tolken.replace_words) over the raw corpus.
tolken_corpus=TO.replace_words(original_corpus)

# Defining stop words and removing them

In [8]:
# Stop words to strip from the tweets: a small hand-picked set of two very
# common words, the <url>/<user> placeholders and basic punctuation.
#
# A disabled alternative built the set from sklearn's ENGLISH_STOP_WORDS,
# removed negation words ("not", "never", "no", ...) from it and added the
# tweet placeholders; it is not used.
custom_stop_words = {"the", "is", "<url>", "<user>", ".", ","}

In [9]:
# Remove the custom stop words from the token-replaced corpus.
stop_words_removed_corpus=CL.remove_stopwords(tolken_corpus, custom_stop_words)

In [10]:
# Sanity check: show one entry after stop-word removal.
print(stop_words_removed_corpus[6])

b'1dnextalbumtitle feel for you / rollercoast of life song cocept life yolo becom famou ? heart followmeplz ! heart x15 exclamation hashtag thereisanumber'


# Creating cluster dictionary and setting words to their cluster ID

In [11]:
# Load the cluster dictionary from the cluster file.
cluster_file = "50mpaths2.txt"
cluster_dictionary = CL.create_dictionary(cluster_file)

In [12]:
# Replace words of the token-replaced corpus with their cluster ID.
# (A disabled variant passed stop_words_removed_corpus as input instead.)
clusteded_corpus=CL.create_clusterized_corpus(tolken_corpus,cluster_dictionary)

In [13]:
# Sanity check: show the first entry of the clusterized corpus.
print(clusteded_corpus[0])

b'<user> 0000 011110101100 1110011001111 01011000111 110110 01010101001 1001100 001000 00101100 1110011001111 100110111 1110011001110 010001001 0110111010 011010100 011010000 0000 01000111110 01110100 001110111 010110100 01110010 010001011101 1111011111010010 thereisanumber'


# Adding n-grams

In [14]:
# n-gram size; with 2 the helper appends bigrams to each entry (see the printed sample).
n_grams = 2
n_grams_corpus = HL.creating_n_grams_cropus(n_grams, tolken_corpus)


In [15]:
# Sanity check: show the first entry of the n-gram corpus.
print(n_grams_corpus[0])

b'<user> i dunno justin read my mention or not onli justin and god know about that but i hope you will follow me believe hashtag thereisanumber _-<user> <user>-i i-dunno dunno-justin justin-read read-my my-mention mention-or or-not not-onli onli-justin justin-and and-god god-know know-about about-that that-but but-i i-hope hope-you you-will will-follow follow-me me-believe believe-hashtag hashtag-thereisanumber thereisanumber-_'


# Cross-validating a linear SVM on each corpus variant

In [16]:
# Corpus variants to evaluate, in the order their results are printed below.
corpuses = [
    original_corpus,            # raw tweets
    tolken_corpus,              # after token replacement
    stop_words_removed_corpus,  # after stop-word removal
    clusteded_corpus,           # words replaced by cluster IDs
    n_grams_corpus,             # token-replaced tweets with n-grams appended
]

In [17]:
# Accumulators for the evaluation (the cross-validation loop resets them for
# every corpus as well).
totalMatSvm = np.zeros((2, 2))  # summed 2x2 confusion matrix
totalsvm = 0                    # running count used for the accuracy

In [18]:
# Cross-validate a TF-IDF + LinearSVC classifier on every corpus variant.
# For each corpus the confusion matrices and the number of correct predictions
# are summed over all folds and reported once the last fold is done.

# Stratified 3-fold split. The splitter is created without shuffling, so it
# carries no per-corpus state and one instance can serve every corpus.
kf = StratifiedKFold(n_splits=3)

for corpus in corpuses:
    # Reset the accumulators for this corpus.
    totalsvm = 0                    # number of correctly classified entries
    totalMatSvm = np.zeros((2, 2))  # summed confusion matrix

    for train_index, test_index in kf.split(corpus, labels):

        X_train = [corpus[i] for i in train_index]
        X_test = [corpus[i] for i in test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # The vectorizer is fitted on the training fold only and then applied
        # unchanged to the held-out fold.
        vectorizer = TfidfVectorizer(
            min_df=10,          # ignore terms that appear in fewer than 10 entries
            max_df=0.9,         # ignore terms that appear in more than 90% of the entries
            sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
            use_idf=True,
        )
        train_corpus_tf_idf = vectorizer.fit_transform(X_train)
        test_corpus_tf_idf = vectorizer.transform(X_test)

        model1 = LinearSVC()
        model1.fit(train_corpus_tf_idf, y_train)
        result1 = model1.predict(test_corpus_tf_idf)

        print(result1)

        totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1)
        # Vectorised count of the correct predictions in this fold.
        totalsvm = totalsvm + np.sum(y_test == result1)

    # Every entry is tested exactly once across the folds, so dividing by the
    # total number of lines gives the overall accuracy.
    print(totalMatSvm, "\n", "Total accuracy:", totalsvm/nr_lines_total)


[ 0.  0.  1. ...,  0.  0.  0.]
[ 1.  1.  1. ...,  0.  0.  0.]
[ 0.  1.  0. ...,  0.  0.  0.]
[[ 76839.  23161.]
 [ 18194.  81806.]] 
 Total accuracy: 0.793225
[ 0.  1.  1. ...,  0.  0.  0.]
[ 1.  1.  1. ...,  0.  0.  0.]
[ 1.  1.  0. ...,  0.  0.  0.]
[[ 77118.  22882.]
 [ 18046.  81954.]] 
 Total accuracy: 0.79536
[ 0.  1.  1. ...,  0.  0.  0.]
[ 1.  1.  1. ...,  0.  0.  0.]
[ 1.  1.  0. ...,  0.  0.  0.]
[[ 76899.  23101.]
 [ 18109.  81891.]] 
 Total accuracy: 0.79395
[ 1.  0.  1. ...,  0.  1.  0.]
[ 1.  0.  1. ...,  1.  1.  0.]
[ 0.  1.  0. ...,  0.  0.  0.]
[[ 75984.  24016.]
 [ 19544.  80456.]] 
 Total accuracy: 0.7822
[ 0.  1.  1. ...,  0.  0.  0.]
[ 1.  1.  1. ...,  0.  0.  0.]
[ 1.  1.  0. ...,  0.  0.  0.]
[[ 77035.  22965.]
 [ 17956.  82044.]] 
 Total accuracy: 0.795395


In [None]:
# Grid search over the TF-IDF min_df / max_df settings on the n-gram corpus
# (the recorded output reports the best score together with those two values).
GS.gridsearch(n_grams_corpus,labels,nr_lines_total)

Best score:  0.781805 
 With min_df: 4  and  Max_df: 0.1
Best score:  0.789835 
 With min_df: 4  and  Max_df: 0.235714285714
Best score:  0.79025 
 With min_df: 4  and  Max_df: 0.371428571429
Best score:  0.790645 
 With min_df: 4  and  Max_df: 0.507142857143
Best score:  0.791085 
 With min_df: 4  and  Max_df: 0.642857142857


# Kaggle delivery:

In [None]:
# DATA PATHS FOR THE KAGGLE RUN
# Only file names are set here; the files themselves are read further down.
# training_set_pos / training_set_neg are rebound to the full training files.
start = time.time()

training_set_pos, training_set_neg = "train_pos_full.txt", "train_neg_full.txt"
test_set = "test_data.txt"

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
start = time.time()

# Preprocessing parameters for the Kaggle run.
# Stop words (kept for reference; the stop-word step below is commented out).
custom_stop_words = {"the", "is", "<url>", "<user>", ".", ","}

# Cluster dictionary loaded from the cluster file.
cluster_file = "50mpaths2.txt"
cluster_dictionary = CL.create_dictionary(cluster_file)

# n-gram size.
n_grams = 2

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
start = time.time()

#### CREATING TRAINING CORPUS ####
# Raw corpus from the positive and negative training files.
inputfiles_train = [training_set_pos, training_set_neg]
original_corpus_train, file_lengths_train = HL.create_corpus(inputfiles_train)

# Stop-word removal is disabled for this run, so the raw corpus is clusterized directly.
# NOTE(review): unlike the cross-validation section, TO.replace_words is not
# applied before clustering / n-grams here -- confirm this is intended.
clusteded_corpus_train = CL.create_clusterized_corpus(
    original_corpus_train, cluster_dictionary
)

# n-grams on top of the clusterized corpus.
n_grams_corpus_train = HL.creating_n_grams_cropus(n_grams, clusteded_corpus_train)

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
start = time.time()

#### CREATING TEST CORPUS ####
# Raw corpus from the test file.
inputfiles_test = [test_set]
original_corpus_test, file_lengths_test = HL.create_corpus(inputfiles_test)

# Same steps as for the training corpus: no stop-word removal, clusterize the
# raw corpus, then add n-grams.
clusteded_corpus_test = CL.create_clusterized_corpus(
    original_corpus_test, cluster_dictionary
)
n_grams_corpus_test = HL.creating_n_grams_cropus(n_grams, clusteded_corpus_test)

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
start = time.time()

# Labels for the full training set. file_lengths_train follows the order of
# inputfiles_train: positive file first, negative second.
nr_pos_lines, nr_neg_lines = file_lengths_train[0], file_lengths_train[1]
nr_lines_total = sum(file_lengths_train)  # number of TRAINING lines

labels = HL.create_labels(nr_pos_lines, nr_neg_lines, kaggle=True)

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
# Grid search on the full training n-gram corpus.
GS.gridsearch(n_grams_corpus_train,labels,nr_lines_total)

In [None]:
start = time.time()

# TF-IDF settings for the final model.
tfidf_settings = dict(
    min_df=10,          # ignore terms that appear in fewer than 10 entries
    max_df=0.97,        # ignore terms that appear in more than 97% of the entries
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
    use_idf=True,
    # stop_words='english' is left disabled
)
vectorizer = TfidfVectorizer(**tfidf_settings)

#CLASSIFYING WITH SVM
model = LinearSVC()

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
start = time.time()

# VECTORIZING: fit the TF-IDF vocabulary on the training corpus only, then
# apply the fitted vectorizer to the test corpus.
train_corpus_fittrans = vectorizer.fit_transform(n_grams_corpus_train)
test_corpus_trans = vectorizer.transform(n_grams_corpus_test)

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
start = time.time()

# FITTING THE MODEL on the vectorized training corpus.
model.fit(train_corpus_fittrans, labels)

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
# Predictions on the training matrix itself (not held-out data).
# NOTE(review): result1 is not used by any later cell shown here.
result1 = model.predict(train_corpus_fittrans)

In [None]:
start = time.time()

# PREDICTING labels for the vectorized test corpus.
submission_prediction = model.predict(test_corpus_trans)

elapsed = time.time()
print("Time in min:", (elapsed - start) / 60)

In [None]:
# Show the predicted labels for the test set.
print(submission_prediction)

In [None]:
start = time.time()

#CREATING SUBMISSION
y_pred = submission_prediction

# One id per TEST prediction, numbered from 1. nr_lines_total must not be used
# here: it was rebound to the number of training lines when the Kaggle labels
# were created, so it does not match the size of the test set.
ids = list(range(1, len(y_pred) + 1))
name = "keggle_submission_cluster_27_11_new.csv"

HL.create_csv_submission(ids, y_pred, name)

elapsed = time.time()
print ("Time in min:", (elapsed - start) / 60 )