In [194]:
'''    imports    '''
# -*- coding: utf-8 -*-

import numpy as np
import sklearn
import csv
import re

from time import time
from pandas import read_csv

from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

In [229]:
'''    functions    '''
def printEach(list):
    for item in list:
        print(item)
        
def printBestParameter(grid_search):
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

def translateNumbers(x):
    # translate indian numbers to arabic numbers
    x = x.replace(u'\u0660','0')
    x = x.replace(u'\u0661','1')
    x = x.replace(u'\u0662','2')
    x = x.replace(u'\u0663','3')
    x = x.replace(u'\u0664','4')
    x = x.replace(u'\u0665','5')
    x = x.replace(u'\u0666','6')
    x = x.replace(u'\u0667','7')
    x = x.replace(u'\u0668','8')
    x = x.replace(u'\u0669','9')
    return x
    
def removePunc(x):    
    # removes punctuation and arabic harakat
    punc = """!$%^&*()-=+.,:'"<>/\?"""
    arabic_semicolon = u"\u061B"
    arabic_comma = u"\u060C"
    arabic_question = u"\u061F"
    arabia_fatha = u"\u064E"
    arabia_2fathas = u"\u064B"
    arabia_damma = u"\u064F"
    arabia_2dammas = u"\u064C"
    arabia_kasra = u"\u0650"
    arabia_2kasras = u"\u064D"
    arabic_sukun = u"\u0652"
    arabic_shadda = u"\u0651"
    punc += arabic_semicolon + arabic_comma + arabic_question + arabia_fatha + arabia_2fathas + arabia_damma + arabia_2dammas + arabia_kasra + arabia_2kasras + arabic_sukun + arabic_shadda
    
    for char in punc:
        x = x.replace(char, "")
        
    return x

def uniformArabic(x):
    # uniform multiple characters to thier origrinal form
    # Alifs -> Alif
    x = x.replace(u'\u0622',u'\u0627')
    x = x.replace(u'\u0623',u'\u0627')
    x = x.replace(u'\u0625',u'\u0627')
    # Taa marbotah -> haa
    x = x.replace(u"\u0629",u"\u0647")
    # Alif_maqsurah + Yaa_Hamzah -> Yaa
    x = x.replace(u'\u0649',u'\u064A')
    x = x.replace(u'\u0626',u'\u064A')
    # Waw_hamzah -> waw
    x = x.replace(u'\u0624',u'\u0648')
    return x

def reduceString(x, string):
    # replaces two same consecutive characters with one
    stringString = string+string
    while stringString in x:
        x = x.replace(stringString, string)
    return x
    
def clean(x):
    # cleans the tweet
    x = translateNumbers(x)
    
    x = re.sub("(https|http)://t.co/([a-zA-Z0-9]){10}"," _LINK_ ",x) # Link
    x = re.sub("[0-9]{10,}"," _NUMBER_ ",x) # Number
    x = re.sub("@.*\s"," _ACCOUNT_ ",x) # Account
    
    x = removePunc(x)
    
    x = uniformArabic(x)
    
    x = reduceString(x, ' ')
    
    # reduce arabic letters
    arabic_letters = [u"\u0627",u"\u0628",u"\u062A",u"\u062B",u"\u062C",u"\u062D",u"\u062E",u"\u062F",u"\u0630",u"\u0631",u"\u0632",u"\u0633",u"\u0634",u"\u0635",u"\u0636",u"\u0637",u"\u0638",u"\u0639",u"\u063A",u"\u0641",u"\u0642",u"\u0643",u"\u0644",u"\u0645",u"\u0646",u"\u0647",u"\u0648",u"\u064A"]
    for letter in arabic_letters:
        x = reduceString(x, letter)
    
    # remove maddah
    x = x.replace(u'\u0640','')
    
    # ال + لا
    x = reduceString(x, u'\u0627\u0644')
    x = reduceString(x, u'\u0644\u0627')
        
    # english repeated letters
    x = reduceString(x, "a")
    x = reduceString(x, "s")
    x = reduceString(x, "h")
    x = reduceString(x, "o")
    x = reduceString(x, "y")
    
    x = reduceString(x, "_")
    
    return x
    
def cleanAll(X):
    # perform clean on all elements
    clean_x = []
    for x in X:
        clean_x.append(clean(x))
    return clean_x

def lenFeatures(X):
    vectorizer = TfidfVectorizer()
    vect = vectorizer.fit_transform(X)
    print("There are " + len(vectorizer.vocabulary_) + " features")
        
def listFeatures(X):
    # list the features of the tfidf vectorizer
    vectorizer = TfidfVectorizer()
    vect = vectorizer.fit_transform(X)
    vocab = []
    for voc in vectorizer.vocabulary_:
        vocab.append(voc)
    vocab.sort()
    print(len(vocab))
    print("\n")
    for voc in vocab:
        print voc+"\n"

def doGridSearch(pipeline, parameters):
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")

    printBestParameter(grid_search)

In [230]:
'''    main    '''
df = read_csv('corpusCinema.csv', engine='python', encoding="UTF-8")
array =  df.values

X = array[:, 0] #Tweet
Y = array[:, 1] #Sentiment

for n,i in enumerate(Y):
    if i==1:
        Y[n]="Positive"
    elif i==0:
        Y[n]="Neutral"
    elif i==-1:
        Y[n]="Negative"

X = cleanAll(X)
# listFeatures(X)
lenFeatures(X)
    
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=7)

TypeError: cannot concatenate 'str' and 'int' objects

In [216]:
pipeline = Pipeline([
    ('1', TfidfVectorizer()),
    ('2', TfidfTransformer()),
    ('3', SVC(kernel='linear', random_state=7, decision_function_shape='ovo'))
])
parameters = {
    '1__max_df': (0.5, 1.0, 1.5),
    
    '2__use_idf': (True, False),
    '2__norm': ('l1', 'l2'),
    
#     '3__C': (0.1, 1.0, 10),
#     '3__tol': (1e-4,  1e-6, 1e-8),
    #'3__class_weight': (None, 'balanced'),
    '3__degree': ( 1,3),
    
}

doGridSearch(pipeline, parameters)
# done in 80.018s
# Best score: 0.756
# Best parameters set:
# 	1__max_df: 0.5
# 	2__norm: 'l2'
# 	2__use_idf: False
# 	3__degree: 1

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.3min finished


done in 80.018s
Best score: 0.756
Best parameters set:
	1__max_df: 0.5
	2__norm: 'l2'
	2__use_idf: False
	3__degree: 1


In [211]:
pipeline = Pipeline([
    ('1', TfidfVectorizer()),
    ('2', TfidfTransformer()),
    ('3', MultinomialNB())
])
parameters = {
    '1__max_df': (0.25, 0.5, 0.75, 1.0),
    
    '2__use_idf': (True, False),
    '2__norm': ('l1', 'l2'),
    
    '3__alpha': (1.0e-10, 1, 10),
}

doGridSearch(pipeline, parameters)
# done in 14.386s
# Best score: 0.722
# Best parameters set:
# 	1__max_df: 0.5
# 	2__norm: 'l1'
# 	2__use_idf: False
# 	3__alpha: 1e-10

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   14.0s finished


done in 14.386s
Best score: 0.722
Best parameters set:
	1__max_df: 0.5
	2__norm: 'l1'
	2__use_idf: False
	3__alpha: 1e-10


In [212]:
pipeline = Pipeline([
    ('1', TfidfVectorizer()),
    ('2', TfidfTransformer()),
    ('3', RandomForestClassifier(random_state=7))
])
parameters = {
#     '1__max_df': (0.25, 0.5, 0.75, 1.0), # 1.0
#     '1__use_idf': (True, False), # true
#     '1__norm': ('l1', 'l2'), # l2
    
#     '2__use_idf': (True, False), # true
#     '2__norm': ('l1', 'l2'), # l2
    
    '3__n_estimators': (5, 10, 15, 20, 25, 35, 50), # 10
    '3__criterion': ('gini', 'entropy'), # gini
    '3__class_weight': (None, 'balanced'), # None
}

doGridSearch(pipeline, parameters)
# done in 37.841s
# Best score: 0.722
# Best parameters set:
# 	3__class_weight: None
# 	3__criterion: 'entropy'
# 	3__n_estimators: 50

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:   34.7s finished


done in 37.841s
Best score: 0.722
Best parameters set:
	3__class_weight: None
	3__criterion: 'entropy'
	3__n_estimators: 50


In [214]:
pipeline = Pipeline([
    ('1', TfidfVectorizer()),
    ('2', TfidfTransformer()),
    ('3', LogisticRegression())
])
parameters = {
    '1__max_df': (0.5, 0.75, 1.0),
    
    '2__use_idf': (True, False),
    '2__norm': ('l1', 'l2'),
}

doGridSearch(pipeline, parameters)
# done in 4.347s
# Best score: 0.724
# Best parameters set:
# 	1__max_df: 1.0
# 	2__norm: 'l2'
# 	2__use_idf: False

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    4.2s finished


done in 4.565s
Best score: 0.724
Best parameters set:
	1__max_df: 1.0
	2__norm: 'l2'
	2__use_idf: False
