In [21]:
#import statements
import pandas as pd
import numpy as np
import os
import re
import time
import nltk
#for dictionary creation
from bs4 import BeautifulSoup
import requests, json
import ast
#feature extraction
from textblob import TextBlob
from nltk import word_tokenize,pos_tag_sents,WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
#classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC 
#obtain accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Reads the emoticon look up table file
header = ['EmoticonSymbol','SentimentScore']
emoticon_data = pd.read_csv('EmoticonLookupTable.txt', delimiter='\t', encoding = 'ISO-8859-1',names=header)
#Writing emoticons to a dictionary
emoji_dict = emoticon_data.groupby('EmoticonSymbol')['SentimentScore'].apply(list).to_dict()

In [12]:
#Getting acronyms and slangs from html page and creating a dictionary
resp = requests.get("http://www.netlingo.com/acronyms.php")
soup = BeautifulSoup(resp.text, "html.parser")
slangdict= {}
key=""
value=""
for div in soup.findAll('div', attrs={'class':'list_box3'}):
  for li in div.findAll('li'):
   for a in li.findAll('a'):
       key =a.text
   value = li.text.split(key)[1]
   slangdict[key]=value
    
#Removing the "-or-" terms in the dictionary and retaining one acronym
for key,value in slangdict.items():
    if "-or-" in value:
       removestring = re.findall('-or-(.*)',value)
       removestring = ''.join(removestring)
       newvalue = value.replace(removestring,'')
       newvalue = newvalue.replace("-or-",'')
       slangdict[key] = newvalue
    elif "-or" in value:
       removestring = re.findall('-or(.*)',value)
       removestring = ''.join(removestring)
       newvalue = value.replace(removestring,'')
       newvalue = newvalue.replace("-or",'')
       slangdict[key] = newvalue
    
key_to_be_replaced = []
for keys in slangdict.keys():
    if " or " in keys:
        key_to_be_replaced.append(keys)

for keys in key_to_be_replaced:
    getkeys = keys.split("or")
    for x in getkeys:
        x = x.strip()
        slangdict[x]= slangdict[keys]
    slangdict.pop(keys,None)   

# store the dictionary
file = open("Slangdictionary.txt",'w',encoding='utf-8')
file.write(str(slangdict))
file.close()

In [6]:
# Reading the slang dictionary that is already created
f = open("Slangdictionary.txt","r")
res1=f.read()
f.close()
slangdict = ast.literal_eval(res1)

In [2]:
# read the training data
header = ['label','comment','parent_comment']
cleaneddata = pd.read_table('clean_data_train_balanced_final.csv',
                    sep='|', 
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [13]:
## function to get list of emojis in a comment
def find_emoji(text):
    return list(x for x in text.split() if x in emoji_dict.keys() )

In [14]:
#method to extract features
def featureextraction(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

In [15]:
# Extracting the features for each comment 
# Punctuation Features and presence of sarcastic symbol and sentiment based features
def allfeatures(user_comment):
    # Punctuation Features 
    if '!' or '.' or '?' in user_comment:
        Numofexclaimations = user_comment.count('!')
        Numofdots = user_comment.count('.')
        Numofquestionmarks = user_comment.count('?')
    else:
        Numofexclaimations = 0
        Numofdots = 0
        Numofquestionmarks = 0
    # Presence of sarcastic symbol
    if '(!)' in user_comment:
        SarcasticSymbol = 1
    else:
        SarcasticSymbol = 0
    
    sentiments = TextBlob(str(user_comment)).sentiment
    polarity = sentiments.polarity
    subjectivity = sentiments.subjectivity
    numofcapitals = sum(x.isupper() for x in user_comment.split() if len(x) > 1 )
    elist = find_emoji(user_comment)
    pscore =0
    nscore = 0
    for item in elist:
        if (emoji_dict[item][0] == 1):
            pscore += 1
        elif (emoji_dict[item][0] == -1):
            nscore += 1
    return Numofexclaimations,Numofdots,Numofquestionmarks,SarcasticSymbol,polarity,subjectivity,numofcapitals,pscore,nscore

In [16]:
helper function to collect number of interjection
def comment_interjection(user_comment):
    count = Counter(tag for word,tag in user_comment)
    return count['UH']

In [22]:
#feature set 1
start_time = time.time() 
featureddataset = featureextraction(cleaneddata, 'comment', allfeatures, ['Numofexclaimations', 'Numofdots','Numofquestionmarks','SarcasticSymbol','Polarity', 'Subjectivity','NumofCapitalWords','PositiveEmojiCount','NegativeEmojiCount'])
end_time = time.time() 
print("time taken ", end_time-start_time)


feature extraction using POS
start_time = time.time()
txt = cleaneddata['comment'].tolist()
POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(map(word_tokenize, txt))
end_time = time.time()
cleaneddata['POS'] = tagged_texts
print("time taken ", end_time-start_time)

# number of interjection
featureddataset['interjection']  = cleaneddata.POS.apply(comment_interjection)

time taken  954.1302464008331
time taken  566.874009847641


In [23]:
# helper function to get parent comment
def get_parent_sentiment(comment):
    sentiments = TextBlob(str(comment)).sentiment
    polarity = sentiments.polarity
    if polarity >= 0.1:
        return 1
    elif polarity < -0.1:
        return -1
    else:
        return 0

In [24]:

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()
#get heighly emotional words (associated with POS tags)
def get_high_emotion_words(postags):
    highly_pos = 0
    highly_neg = 0
    POS_list = ['JJ','JJR','JJS', 'RB','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ']
    num_tokens = len(postags)
    for i in range(num_tokens):
        if postags[i][1] in POS_list:            
            #check sentiment of next word
            if i < (num_tokens - 1) :
                senti_word = sia.polarity_scores(postags[i+1][0])
                if senti_word['pos'] == 1:
                    highly_pos += 1
                if senti_word['neg'] == 1:
                    highly_neg += 1
                
    return highly_pos, highly_neg 



In [25]:
# Additional features pos_words, neg_words, flip_count
def get_pos_neg_word_count(tokens):
    
    pos_word_count = 0
    neg_word_count = 0    
    pos_flag = False
    neg_flag = False
    flip_count = 0
        
    for word in tokens:
        senti = sia.polarity_scores(str(word)) 
        if senti["pos"] == 1:
            pos_word_count += 1
            pos_flag = True
            if neg_flag:
                flip_count += 1
                neg_flag =  False
                
        elif senti["neg"] == 1:
            neg_word_count += 1
            neg_flag = True
            if pos_flag:
                flip_count +=1
                pos_flag = False
    return pos_word_count, neg_word_count,flip_count

In [26]:
# feature set 2
start_time = time.time()
#parent comment sentiment
featureddataset['parent_sentiment'] = cleaneddata.parent_comment.apply(get_parent_sentiment)
end_time = time.time()
print("time taken ", end_time-start_time)
#sentiment intensifier
featureddataset['highly_positive'],featureddataset['highly_negative'] = zip(*cleaneddata['POS'].map(get_high_emotion_words))
# sentiment word count
start_time = time.time()
emotion_dataset = featureextraction(featureddataset, 'comment', get_pos_neg_word_count, ['PosWords','NegWords','FlipCount'])
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  487.71743178367615
time taken  931.3737943172455


In [27]:
#glove embedding

from keras.preprocessing.text import Tokenizer
# 
# load the Glove embedding into memory
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# Tokenize the comments
Word_tokenizer = Tokenizer()
Word_tokenizer.fit_on_texts(featureddataset['comment'])
# Word_tokenizer.num_words = 100000
vocab_size = len(Word_tokenizer.word_index) + 1
#encode the train tokens to sequence
sequences = Word_tokenizer.texts_to_sequences(featureddataset['comment'])

# create embedding matrix
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in Word_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


Using TensorFlow backend.


Found 400000 word vectors.


In [28]:
#add embeddings to the comments
emotion_dataset['embedding'] = sequences

In [29]:
import math
#helper to calculate the squareroot of square of the embedding matrix
embedding_square ={}
keys = range(vocab_size)
for i in keys:
    embedding = embedding_matrix[i]
    sum_square = 0
    for j in range(len(embedding)):
        values = embedding[j]
        sum_square += values*values
    embedding_square[i] = math.sqrt(sum_square)
    
file = open("embedding_square_train.txt",'w',encoding='utf-8')
file.write(str(embedding_square))
file.close()

In [30]:

# helper to calculate the cosine similarity between two words
def cosine_similarity(word1,word2,v1,v2):
#     "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxy = 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxy += x*y
    return sumxy/(embedding_square[word1]*embedding_square[word2])

In [31]:
#helper function to calculate the cosine similarity between the words in te comments
def calculate_similarity(comment_token):
    token_array = np.matrix(comment_token)
    comment_len = token_array.shape[1]
    most_similar = least_similar = most_dissimilar = least_dissimilar = 0
    if comment_len > 0:
        mat = np.empty(shape=(comment_len,comment_len))
        mat[:] = np.nan
        for i in range(0,comment_len):
            for j in range(i+1,comment_len):
                a = comment_token[i]
                b = comment_token[j]
                mat[i][j] = cosine_similarity(a,b,embedding_matrix[a],embedding_matrix[b].T)
                mat[j][i] = mat[i][j]
    #get the most similar
        similar_mat = np.nanmax(mat,axis=0)
        most_similar = np.nanmax(similar_mat)
        least_similar = np.nanmin(similar_mat)
    #get the most dissimilar 
        dissimilar_mat = np.nanmin(mat,axis=0)
        most_dissimilar = np.nanmax(dissimilar_mat)
        least_dissimilar = np.nanmin(dissimilar_mat)
        
    return most_similar, least_similar, most_dissimilar, least_dissimilar 

In [32]:
# feature set 3
start_time = time.time()
embedded_dataset = featureextraction(emotion_dataset, 'embedding', calculate_similarity, ['most_similar','least_similar','most_dissimilar','least_dissimilar'])
end_time = time.time()
print("time taken ", end_time-start_time)

  app.launch_new_instance()


time taken  5919.951149225235


In [33]:
values = {'most_similar': 0, 'least_similar': 0, 'most_dissimilar': 0, 'least_dissimilar': 0}
embedded_dataset = embedded_dataset.fillna(value=values)

In [None]:
#Create embedding features with word2vec for training data
from sklearn.metrics.pairwise import cosine_similarity
def allembeddingfeatures(user_comment):
    #user_comment = user_comment.replace("'","")
    result = text_to_word_sequence(user_comment,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ')
    #print(result)
    length = len(result)
    similaritymatrix = np.zeros((length,length))
    i=0
    for word in result:
        wordseq = embeddings_dict[word]
        j=0
        # Similarity Matrix computation
        for word1 in result:
            if similaritymatrix[i][j] == 0:
                wordseq1 = embeddings_dict[word1]
                #print(wordseq.shape)
                #print(wordseq1.shape)
                k = cosine_similarity(wordseq,wordseq1)
                similaritymatrix[i][j] = k
                similaritymatrix[j][i] = k
                j = j+1
            else:
                j = j+1
        i = i + 1
    if similaritymatrix.shape==(0,0):
        maxmostsimilar = 0
        minmostsimilar = 0
        maxmostdissimilar = 0
        minmostdissimilar = 0
    else:    
        # Comuting four features
        np.fill_diagonal(similaritymatrix,0)
        feature1 = similaritymatrix.max(axis=0)
        maxmostsimilar = np.max(feature1)
        minmostsimilar = np.min(feature1)
        np.fill_diagonal(similaritymatrix,1)
        feature2 = similaritymatrix.min(axis=0)
        maxmostdissimilar = np.max(feature2)
        minmostdissimilar = np.min(feature2)
    return maxmostsimilar,minmostsimilar,maxmostdissimilar,minmostdissimilar

In [None]:
#Create embedding features with word2vec for test data
def alltestembeddingfeatures(user_comment):
    #user_comment = user_comment.replace("'","")
    result = text_to_word_sequence(user_comment,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ')
    #print(result)
    length = len(result)
    similaritymatrix = np.zeros((length,length))
    i=0
    for word in result:
        wordseq = embeddings_test_dict[word]
        j=0
        # Similarity Matrix Computation
        for word1 in result:
            if similaritymatrix[i][j] == 0:
                wordseq1 = embeddings_test_dict[word1]
                #print(wordseq.shape)
                #print(wordseq1.shape)
                k = cosine_similarity(wordseq,wordseq1)
                similaritymatrix[i][j] = k
                similaritymatrix[j][i] = k
                j = j+1
            else:
                j = j+1
        i = i + 1
    if similaritymatrix.shape==(0,0):
        maxmostsimilar = 0
        minmostsimilar = 0
        maxmostdissimilar = 0
        minmostdissimilar = 0
    else:    
        #Computing four features
        np.fill_diagonal(similaritymatrix,0)
        feature1 = similaritymatrix.max(axis=0)
        maxmostsimilar = np.max(feature1)
        minmostsimilar = np.min(feature1)
        np.fill_diagonal(similaritymatrix,1)
        feature2 = similaritymatrix.min(axis=0)
        maxmostdissimilar = np.max(feature2)
        minmostdissimilar = np.min(feature2)
    return maxmostsimilar,minmostsimilar,maxmostdissimilar,minmostdissimilar

In [None]:
# Changing the comments to sequence
from keras.preprocessing.text import text_to_word_sequence
def comment_clean1(user_comment):  
    result = text_to_word_sequence(user_comment,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ')
    return result

In [None]:
# Saving the word2vec model created
traindata1=featureddataset
traindata1['comment'] = traindata1.comment.apply(comment_clean1)
usercomment1 = traindata1['comment'].values.tolist()
embedding_dim = 100
model = Word2Vec(usercomment1, size=embedding_dim, window=5,workers=4, min_count=1)
words = list(model.wv.vocab)


In [None]:
#Writing the model created to a file with train data embeddings
file = "word2vec_embedding_train_data.txt"
model.wv.save_word2vec_format(file,binary=False)

In [None]:
# Creating the embedding dictionary with words from training data
import os
embeddings_dict = {}
f = open("word2vec_embedding_train_data.txt", encoding = "utf-8")
i=1
for line in f:
        line = line.replace('\U00002013', '-')
        values = line.split()
        word = values[0]
        value = ' '.join(values[1:])
        val = list(map(float, value.split()))
        coefs = np.array([val])
        #print(coefs.shape)
        coefs = [np.asarray(val)]
        #print(coefs)
        embeddings_dict[word] = coefs
f.close()

In [None]:
# Adding the word2vec embeddings with existing features
embedded_dataset = featureextraction(emotion_dataset, 'comment', allembeddingfeatures, ['Maxmostsimilar','Minmostsimilar','Maxmostdissimilar','Minmostdissimilar'])

In [34]:
#write the cleaned data with features into a csv file
embedded_dataset.to_csv('clean_data_with_all_features.csv',
           sep= '|',
           index=False)

In [35]:
# steps for test data preparation
# read the test data
header = ['label','comment','parent_comment']
testdata = pd.read_table('clean_data_test_balanced_Wparent.csv',
                    sep='|', 
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [36]:
#feature set 1
start_time = time.time() 
featuredset = featureextraction(testdata, 'comment', allfeatures, ['Numofexclaimations', 'Numofdots','Numofquestionmarks','SarcasticSymbol','Polarity', 'Subjectivity','NumofCapitalWords','PositiveEmojiCount','NegativeEmojiCount'])
end_time = time.time() 
print("time taken ", end_time-start_time)
#feature extraction using POS
start_time = time.time()
txt = testdata['comment'].tolist()
#POS tagging for all the tokens in the sentence
tagged_texts = pos_tag_sents(map(word_tokenize, txt))
end_time = time.time()
testdata['POS'] = tagged_texts
print("time taken ", end_time-start_time)

# number of interjection
featuredset['interjection']  = testdata.POS.apply(comment_interjection)

time taken  676.2500035762787
time taken  143.84018182754517


In [37]:
# feature set 2
start_time = time.time()
#parent comment sentiment
featuredset['parent_sentiment'] = testdata.parent_comment.apply(get_parent_sentiment)
end_time = time.time()
print("time taken ", end_time-start_time)
#sentiment intensifier
featuredset['highly_positive'],featuredset['highly_negative'] = zip(*testdata['POS'].map(get_high_emotion_words))
# sentiment word count
start_time = time.time()
test_dataset = featureextraction(featuredset, 'comment', get_pos_neg_word_count, ['PosWords','NegWords','FlipCount'])
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  122.3596031665802
time taken  198.8806071281433


In [38]:
# Tokenize the comments

Word_tokenizer.fit_on_texts(featuredset['comment'])
# Word_tokenizer.num_words = 100000
vocab_size = len(Word_tokenizer.word_index) + 1
#encode the train tokens to sequence
sequences = Word_tokenizer.texts_to_sequences(featuredset['comment'])

# create embedding matrix
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in Word_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [39]:
#add embeddings to the comments
test_dataset['embedding'] = sequences

In [40]:
#helper to calculate the squareroot of square of the embedding matrix
embedding_square ={}
keys = range(vocab_size)
for i in keys:
    embedding = embedding_matrix[i]
    sum_square = 0
    for j in range(len(embedding)):
        values = embedding[j]
        sum_square += values*values
    embedding_square[i] = math.sqrt(sum_square)
    
file = open("embedding_square_test.txt",'w',encoding='utf-8')
file.write(str(embedding_square))
file.close()

In [41]:
# feature set 3
start_time = time.time()
embedded_testset = featureextraction(test_dataset, 'embedding', calculate_similarity, ['most_similar','least_similar','most_dissimilar','least_dissimilar'])
end_time = time.time()
print("time taken ", end_time-start_time)

  app.launch_new_instance()


time taken  1327.6528935432434


In [42]:
values = {'most_similar': 0, 'least_similar': 0, 'most_dissimilar': 0, 'least_dissimilar': 0}
embedded_testset = embedded_testset.fillna(value=values)

In [None]:
# Creating the word2vec model for test data
testdata1 = test_dataset
testdata1['comment'] = testdata1.comment.apply(comment_clean1)
usercomment10 = testdata1['comment'].values.tolist()
model = Word2Vec(usercomment10, size=embedding_dim, window=5,workers=4, min_count=1)
words = list(model.wv.vocab)

In [None]:
# Saving the model to a file
file = "word2vec_embedding_test_data.txt"
model.wv.save_word2vec_format(file,binary=False)

In [None]:
# Creating embeddings dictionary for test data
embeddings_test_dict = {}
f = open("word2vec_embedding_test_data.txt", encoding = "utf-8")
i=1
for line in f:
        line = line.replace('\U00002013', '-')
        values = line.split()
        word = values[0]
        value = ' '.join(values[1:])
        val = list(map(float, value.split()))
        coefs = np.array([val])
        #print(coefs.shape)
        coefs = [np.asarray(val)]
        #print(coefs)
        embeddings_test_dict[word] = coefs
f.close()

In [None]:
# Extracting the four embedding features of word2vec
embedded_testset = featureextraction(test_dataset, 'comment', alltestembeddingfeatures, ['Maxmostsimilar','Minmostsimilar','Maxmostdissimilar','Minmostdissimilar'])

In [43]:
#write the cleaned data with features into a csv file
embedded_testset.to_csv('clean_testdata_with_all_features.csv',
           sep= '|',
           index=False)

In [44]:
# load the train data with features
traindata_withfeature = pd.read_table('clean_data_with_all_features.csv',
                    sep='|', 
                   # delimiter=',',
#                     usecols=[0,1,2],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

# load the test data with features
testdata_withfeature = pd.read_table('clean_testdata_with_all_features.csv',
                    sep='|', 
                   # delimiter=',',
#                     usecols=[0,1,2],
                    dtype={'label':int,'comment':str},
                    keep_default_na=False)

In [45]:
# drop embedding
traindata_withfeature = traindata_withfeature.drop(columns=['embedding'])

testdata_withfeature = testdata_withfeature.drop(columns=['embedding'])

In [46]:
# drop parent sentiment
# parent_sentiment
traindata_Wparent = traindata_withfeature
traindata_Wparent = traindata_Wparent.drop(columns=['parent_sentiment'])

testdata_Wparent = testdata_withfeature
testdata_Wparent = testdata_Wparent.drop(columns=['parent_sentiment'])

In [69]:
# drop comments with comment length < 2
traindata = traindata_withfeature
# droped_train = traindata.where(traindata['comment'].str.split().str.len()>1)
droped_train = traindata[traindata['comment'].str.split().str.len()>2]


In [70]:
testdata = testdata_withfeature
droped_test = testdata[testdata['comment'].str.split().str.len()>2]
droped_test.shape
# testdata_withfeature.shape

(226169, 23)

In [None]:
# drop new added features(5)
traindata_parent = traindata_withfeature
traindata_parent = traindata_parent.drop(columns=['highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

testdata_parent = testdata_withfeature
testdata_parent = testdata_parent.drop(columns=['highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

In [None]:
# with only embeddings
traindata = traindata_withfeature
traindata = traindata.drop(columns=['parent_sentiment','highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

testdata = testdata_withfeature
testdata = testdata.drop(columns=['parent_sentiment','highly_positive','highly_negative','PosWords','NegWords','FlipCount'])

In [71]:
#train data features
newtrain = pd.DataFrame(droped_train.iloc[:, 3:])
#train data labels
targetlabel = droped_train.iloc[:,0]

In [72]:
# test data features
newtest = pd.DataFrame(droped_test.iloc[:, 3:])
#test data labels
testlabel = droped_test.iloc[:,0]

In [73]:
# gradient boosting algorithm
start_time = time.time()
gradient_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=1)
gradient_clf.fit(newtrain,targetlabel)
end_time = time.time()
print("time taken ", end_time-start_time)

time taken  53.68022584915161


In [74]:
gradient_predictions = gradient_clf.predict(newtest)

print("Confusion Matrix")
print(confusion_matrix(testlabel, gradient_predictions))
print("Classification Report")
print(classification_report(testlabel, gradient_predictions))

Confusion Matrix
[[69167 39977]
 [50156 66869]]
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.63      0.61    109144
           1       0.63      0.57      0.60    117025

   micro avg       0.60      0.60      0.60    226169
   macro avg       0.60      0.60      0.60    226169
weighted avg       0.60      0.60      0.60    226169



In [67]:
#random forest

from sklearn.ensemble import RandomForestClassifier 

start_time = time.time()
random_clf = RandomForestClassifier(n_jobs=2, random_state=0)
random_clf.fit(newtrain,targetlabel)
end_time = time.time()
print("time taken ", end_time-start_time)




time taken  23.603333711624146


In [68]:

random_predictions = random_clf.predict(newtest)

print("Confusion Matrix")
print(confusion_matrix(testlabel, random_predictions))
print("Classification Report")
print(classification_report(testlabel, random_predictions))

Confusion Matrix
[[75569 39622]
 [57933 63068]]
Classification Report
              precision    recall  f1-score   support

           0       0.57      0.66      0.61    115191
           1       0.61      0.52      0.56    121001

   micro avg       0.59      0.59      0.59    236192
   macro avg       0.59      0.59      0.59    236192
weighted avg       0.59      0.59      0.59    236192



In [None]:

from sklearn.svm import SVC  
start_time = time.time()
# linear kernel
svclassifier = SVC(kernel='linear',C=1,gamma=1)  
svclassifier.fit(X_train, y_train) 

y_pred = svclassifier.predict(X_test)  

end_time = time.time()
print("time taken ", end_time-start_time)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))

# Guassian kernel
start_time = time.time()
svclassifier = SVC(kernel='rbf',C=1,gamma=1)  
svclassifier.fit(X_train, y_train) 

y_pred = svclassifier.predict(X_test)  
end_time = time.time()
print("time taken ", end_time-start_time)
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))

# polynomial kernel with degree 3
start_time = time.time()
svclassifier = SVC(kernel='poly', degree=3) 
svclassifier.fit(X_train, y_train) 

y_pred = svclassifier.predict(X_test) 
end_time = time.time()
print("time taken ", end_time-start_time)

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))
