In [1]:
import sqlite3
from sklearn import metrics
from scipy.sparse import hstack
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm 
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns

In [2]:
cnx = sqlite3.connect('DebateOrg/ddo-V1.db')
c = cnx.cursor()
#debates title with users and category
debates = pd.read_sql_query('SELECT participant_1_link,category,url,title, participant_1_position FROM debates ', cnx)
#users
users = pd.read_sql_query('SELECT * FROM users', cnx)
#selecting only the useful columns like party,religion, ethnicity etc
users=users[['url','ideology','party','gender','ethnicity','religion','win_ratio']]
interval = re.compile('^(\d+\.\d+)\%$')

def read(elem):
    m=interval.search(elem)
    if m:
        num = float(m.group(1))
        return((num)/100)
users['win_ratio']=users['win_ratio'].apply(read)
#deb_users=pd.merge(debates, users, how='left', left_on='participant_1_link', right_on='url')
#deb_users=deb_users[['category','title','url_x','url_y','ideology','party','gender','ethnicity','religion','participant_1_position','win_ratio']]
#deb_users=deb_users.rename(columns=({'url_y':'username', 'url_x':'url', 'participant_1_position':'user_position'}))
username = re.compile('^.(\w+).$')

def find_username(elem):
    m=username.search(str(elem))
    if m:
        un = m.group(1)
        return un
    
debates['username']=debates['participant_1_link'].apply(find_username)
users["url"]=users["url"].apply(find_username)

In [3]:
users=users[users["religion"]!="Not Saying"]
users=users[users["party"]!="Not Saying"]
users=users[users["ideology"]!="Not Saying"]
users=users[users["ethnicity"]!="Not Saying"]
users=users[users["gender"]!="Prefer not to say"]

In [4]:
def groupping_gender(x):
    #gender
    if x in ("Agender","Androgyne","Bigender","Genderqueer","Transgender Female","Transgender Male"):
        x = "LGBTQIAPK+" 
    return x 
def groupping_ideol(x):    
    #ideology
    if x in ("Apathetic","Undecided","Other"):
        x="Other"
    elif x in ("Anarchist","Communist","Green","Socialist"):
        x="Left"
    elif x in ("Labor","Moderate","Progressive"):
        x="Center"
    else:
        x= "Right"
    return x

def groupping_religion(x):
    if x in ("Agnostic","Atheist"):
        x=x
    elif x in ("Christian - Catholic", "Christian","Christian - Mennonite", "Christian - Amish","Christian - Episcopalian","Christian - Jehovah's Witness","Christian - Seventh-Day Adventist","Christian - Assemblies of God","Christian - Anglican","Christian - Greek Orthodox","Christian - Presbytarian","Christian - Pentecosta","Christian - Methodist","Christian - Church of Christ","Christian - Lutheran","Christian - Latter-Day Saints","Christian - Protestant","Christian - Baptist"):
        x="Christian"
    elif x in ("Muslim - Sufi","Muslim - Shiite","Islamic","Muslim - Sunni","Muslim"):
        x="Muslim"
    else:
        x="Other"
    return x

position = re.compile('^([A-Z][a-z]+)\s.(\w+).$')

def pos(elem):
    m=position.search(str(elem))
    if m:
        un = m.group(2)
        return un

In [5]:
users["religion"]=users["religion"].apply(groupping_religion)
users["gender"]=users["gender"].apply(groupping_gender)
users["ideology"]=users["ideology"].apply(groupping_ideol)


In [6]:
users=users.reset_index()
users=users[["url","ideology","party","gender","ethnicity","religion","win_ratio"]]

In [7]:
users

Unnamed: 0,url,ideology,party,gender,ethnicity,religion,win_ratio
0,Mikal,Right,Independent,Male,White,Agnostic,0.9684
1,bluesteel,Right,Republican Party,Female,White,Christian,0.9811
2,Kleptin,Other,Other,Male,Asian,Agnostic,0.9448
3,tejretics,Right,Democratic Party,Male,East Indian,Atheist,0.9531
4,thett3,Right,Independent,Female,White,Christian,0.9133
...,...,...,...,...,...,...,...
4163,,Left,American Nazi Party,Male,White,Christian,0.1587
4164,FanboyMctroll,Left,Pirate Party,Male,White,Atheist,0.1379
4165,jp_porwisz10,Right,Republican Party,Male,White,Christian,0.0909
4166,hd1997,Other,Democratic Party,Female,White,Other,0.1667


In [8]:
debates

Unnamed: 0,participant_1_link,category,url,title,participant_1_position,username
0,/Mikal/,Miscellaneous,/debates/Covenys-recent-behavior-would-be-a-ne...,Covenys recent behavior would be a net detrime...,Pro (for),Mikal
1,/Mikal/,Politics,/debates/Islam-is-a-civilized-religion/1/,Islam is a civilized religion,Con (against),Mikal
2,/Mikal/,Games,/debates/Gaming-is-a-sport/4/,Gaming is a sport,Pro (for),Mikal
3,/JimShady/,Music,/debates/Rap-Battle-of-Amazingness-13/1/,"Rap Battle of Amazingness, 13",Con (against),JimShady
4,/Mikal/,Miscellaneous,/debates/Racial-Religious-Profiling-Is-a-Ratio...,Racial/Religious Profiling Is a Rational Polit...,Pro (for),Mikal
...,...,...,...,...,...,...
81794,/backwardseden/,Religion,/debates/Young-Boy-Found-at-a-New-Mexico-Compo...,Young Boy Found at a New Mexico Compound Died ...,Con (against),backwardseden
81795,/Thomasmariel33/,Politics,/debates/When-governments-employ-the-term-stri...,When governments employ the term strike (as in...,Pro (for),Thomasmariel33
81796,/Thomasmariel33/,Society,/debates/Should-the-words-poor-and-homeless-be...,Should the words poor and homeless be eradicat...,Pro (for),Thomasmariel33
81797,/Thomasmariel33/,Movies,/debates/Does-the-voice-calling-Jesses-name-as...,"Does the voice, Calling Jesse's name as she sl...",Pro (for),Thomasmariel33


In [9]:
arguments=pd.read_sql_query('SELECT * FROM arguments',cnx)
arguments['user']=arguments['user'].apply(find_username)

In [10]:
arg=pd.merge(arguments,debates, left_on="debate",right_on="url", how="left")

In [11]:
arg.columns

Index(['debate', 'text', 'user', 'side', 'round_number', 'participant_1_link',
       'category', 'url', 'title', 'participant_1_position', 'username'],
      dtype='object')

In [12]:
arg_1=arg[['text', 'user','side','category', 'title']]

In [13]:
users.columns

Index(['url', 'ideology', 'party', 'gender', 'ethnicity', 'religion',
       'win_ratio'],
      dtype='object')

In [63]:
arg_2=pd.merge(arg_1, users, left_on="user",right_on="url",how="left")


In [64]:
arg_2=arg_2.dropna().reset_index()

In [65]:
arg_2["features"]=arg_2['ideology']+' '+arg_2['party']+' '+arg_2['gender']+' '+arg_2['ethnicity']+' '+arg_2['religion']

In [66]:
arg_2.columns

Index(['index', 'text', 'user', 'side', 'category', 'title', 'url', 'ideology',
       'party', 'gender', 'ethnicity', 'religion', 'win_ratio', 'features'],
      dtype='object')

In [67]:
arg_2

Unnamed: 0,index,text,user,side,category,title,url,ideology,party,gender,ethnicity,religion,win_ratio,features
0,0,Some context on the resolution. Normally I wou...,Mikal,Pro,Miscellaneous,Covenys recent behavior would be a net detrime...,Mikal,Right,Independent,Male,White,Agnostic,0.9684,Right Independent Male White Agnostic
1,2,We can dismiss most everything in his last rou...,Mikal,Pro,Miscellaneous,Covenys recent behavior would be a net detrime...,Mikal,Right,Independent,Male,White,Agnostic,0.9684,Right Independent Male White Agnostic
2,4,On to rebuttals1) IntroI'm not sure what he is...,Mikal,Pro,Miscellaneous,Covenys recent behavior would be a net detrime...,Mikal,Right,Independent,Male,White,Agnostic,0.9684,Right Independent Male White Agnostic
3,6,This is a debate challenge to Cassie on a foru...,Mikal,Con,Politics,Islam is a civilized religion,Mikal,Right,Independent,Male,White,Agnostic,0.9684,Right Independent Male White Agnostic
4,8,Resending this to Zaro as this was intended fo...,Mikal,Con,Politics,Islam is a civilized religion,Mikal,Right,Independent,Male,White,Agnostic,0.9684,Right Independent Male White Agnostic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94713,5908627,Don't dismiss my position as a non-argument. Y...,Mikegj1077,Con,Health,Gov. funded health care,Mikegj1077,Right,Republican Party,Male,White,Christian,0.0556,Right Republican Party Male White Christian
94714,5908629,"""UK - booming economy, Nationalized Health Ser...",Mikegj1077,Con,Health,Gov. funded health care,Mikegj1077,Right,Republican Party,Male,White,Christian,0.0556,Right Republican Party Male White Christian
94715,5912374,"Communism was terrible, It made people equally...",billsands,Pro,Politics,The USSR could have survived if it has adopted...,billsands,Left,Socialist Party,Male,Other,Agnostic,0.3125,Left Socialist Party Male Other Agnostic
94716,5912376,Where did i state marx was right about everyth...,billsands,Pro,Politics,The USSR could have survived if it has adopted...,billsands,Left,Socialist Party,Male,Other,Agnostic,0.3125,Left Socialist Party Male Other Agnostic


In [68]:
arg_2=arg_2[['text', 'user', 'side', 'category', 'title', 'url', 'ideology','party', 'gender', 'ethnicity', 'religion', 'win_ratio','features']]

In [69]:
lol=arg_2


In [70]:
#Pre processing text
# Step - a : Remove blank rows if any.
lol['features'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
lol['features'] = [entry.lower() for entry in lol['features']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
lol['features']= [word_tokenize(entry) for entry in lol['features']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(lol['features']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        #print(word,tag)
        ## Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'title_final'
    lol.loc[index,'features_final'] = str(Final_words)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lol['features'].dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lol['features'] = [entry.lower() for entry in lol['features']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lol['features']= [word_tokenize(entry) for entry in lol['features']]


In [71]:
#Pre processing text
# Step - a : Remove blank rows if any.
lol['title'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
lol['title'] = [entry.lower() for entry in lol['title']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
lol['title']= [word_tokenize(entry) for entry in lol['title']]


In [None]:
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(lol['title']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        #print(word,tag)
        ## Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'title_final'
    lol.loc[index,'title_final'] = str(Final_words)

In [73]:
# Step - a : Remove blank rows if any.
lol['text'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
lol['text'] = [entry.lower() for entry in lol['text']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
lol['text']= [word_tokenize(entry) for entry in lol['text']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(lol['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'title_final'
    lol.loc[index,'text_final'] = str(Final_words)

In [74]:
lol.to_csv("tokenized.csv")

In [75]:
np.random.seed(500)

In [76]:
lol=lol.dropna()

In [77]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(lol[["title_final",'text_final',"side"]],lol['religion'],test_size=0.3)

In [78]:
Encoder = LabelEncoder()
Train_Y_1 = Encoder.fit_transform(Train_Y)
Test_Y_1 = Encoder.fit_transform(Test_Y)


In [79]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["title_final"])
Train_X_Tfidf_title = Tfidf_vect.transform(Train_X["title_final"])
Test_X_Tfidf_title = Tfidf_vect.transform(Test_X["title_final"])


In [80]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["text_final"])
Train_X_Tfidf_text = Tfidf_vect.transform(Train_X["text_final"])
Test_X_Tfidf_text = Tfidf_vect.transform(Test_X["text_final"])


In [81]:
Train_X_1_pos = Encoder.fit_transform(Train_X['side'])
Test_X_1_pos = Encoder.fit_transform(Test_X['side'])


In [82]:
mat_train= hstack([Train_X_Tfidf_title,Train_X_Tfidf_text])
X_train_tfidf = hstack([mat_train, Train_X_1_pos.reshape(-1, 1)])

In [83]:
mat_test= hstack([Test_X_Tfidf_title,Test_X_Tfidf_text])
X_test_tfidf = hstack([mat_test, Test_X_1_pos.reshape(-1, 1)])

In [84]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf,Train_Y_1)

SVC(gamma='auto', kernel='linear')

In [86]:
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y_1)*100)

SVM Accuracy Score ->  53.948479729729726


In [87]:
print(metrics.classification_report(Test_Y_1, predictions_SVM,target_names=["Agnostic","Atheist","Christian","Muslim","Other"]))

              precision    recall  f1-score   support

    Agnostic       0.50      0.34      0.41      4205
     Atheist       0.52      0.48      0.50      7606
   Christian       0.56      0.76      0.65     11422
      Muslim       0.77      0.32      0.45       468
       Other       0.52      0.29      0.37      4715

    accuracy                           0.54     28416
   macro avg       0.57      0.44      0.47     28416
weighted avg       0.53      0.54      0.52     28416



In [88]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(lol[["title_final",'text_final',"side"]],lol['ideology'],test_size=0.3)
Encoder = LabelEncoder()
Train_Y_1 = Encoder.fit_transform(Train_Y)
Test_Y_1 = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["title_final"])
Train_X_Tfidf_title = Tfidf_vect.transform(Train_X["title_final"])
Test_X_Tfidf_title = Tfidf_vect.transform(Test_X["title_final"])
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["text_final"])
Train_X_Tfidf_text = Tfidf_vect.transform(Train_X["text_final"])
Test_X_Tfidf_text = Tfidf_vect.transform(Test_X["text_final"])
Train_X_1_pos = Encoder.fit_transform(Train_X['side'])
Test_X_1_pos = Encoder.fit_transform(Test_X['side'])
mat_train= hstack([Train_X_Tfidf_title,Train_X_Tfidf_text])
X_train_tfidf = hstack([mat_train, Train_X_1_pos.reshape(-1, 1)])
mat_test= hstack([Test_X_Tfidf_title,Test_X_Tfidf_text])
X_test_tfidf = hstack([mat_test, Test_X_1_pos.reshape(-1, 1)])

In [89]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf,Train_Y_1)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y_1)*100)

SVM Accuracy Score ->  62.39794481981982


In [90]:
print(metrics.classification_report(Test_Y_1, predictions_SVM,target_names=["Center","Left","Center","Right"]))

              precision    recall  f1-score   support

      Center       0.54      0.19      0.28      3965
        Left       0.61      0.24      0.34      3380
      Center       0.60      0.27      0.37      5211
       Right       0.63      0.93      0.75     15860

    accuracy                           0.62     28416
   macro avg       0.59      0.41      0.44     28416
weighted avg       0.61      0.62      0.57     28416



In [91]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(lol[["title_final",'text_final',"side"]],lol['gender'],test_size=0.3)
Encoder = LabelEncoder()
Train_Y_1 = Encoder.fit_transform(Train_Y)
Test_Y_1 = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["title_final"])
Train_X_Tfidf_title = Tfidf_vect.transform(Train_X["title_final"])
Test_X_Tfidf_title = Tfidf_vect.transform(Test_X["title_final"])
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["text_final"])
Train_X_Tfidf_text = Tfidf_vect.transform(Train_X["text_final"])
Test_X_Tfidf_text = Tfidf_vect.transform(Test_X["text_final"])
Train_X_1_pos = Encoder.fit_transform(Train_X['side'])
Test_X_1_pos = Encoder.fit_transform(Test_X['side'])
mat_train= hstack([Train_X_Tfidf_title,Train_X_Tfidf_text])
X_train_tfidf = hstack([mat_train, Train_X_1_pos.reshape(-1, 1)])
mat_test= hstack([Test_X_Tfidf_title,Test_X_Tfidf_text])
X_test_tfidf = hstack([mat_test, Test_X_1_pos.reshape(-1, 1)])

In [92]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf,Train_Y_1)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y_1)*100)

SVM Accuracy Score ->  88.66835585585585


In [93]:
print(metrics.classification_report(Test_Y_1, predictions_SVM,target_names=["Female","LGBTQIAPK+","Male"]))

              precision    recall  f1-score   support

      Female       0.70      0.06      0.12      3080
  LGBTQIAPK+       0.90      0.23      0.37       330
        Male       0.89      1.00      0.94     25006

    accuracy                           0.89     28416
   macro avg       0.83      0.43      0.47     28416
weighted avg       0.87      0.89      0.84     28416



In [94]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(lol[["title_final",'text_final',"side"]],lol['ethnicity'],test_size=0.3)
Encoder = LabelEncoder()
Train_Y_1 = Encoder.fit_transform(Train_Y)
Test_Y_1 = Encoder.fit_transform(Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["title_final"])
Train_X_Tfidf_title = Tfidf_vect.transform(Train_X["title_final"])
Test_X_Tfidf_title = Tfidf_vect.transform(Test_X["title_final"])
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(lol["text_final"])
Train_X_Tfidf_text = Tfidf_vect.transform(Train_X["text_final"])
Test_X_Tfidf_text = Tfidf_vect.transform(Test_X["text_final"])
Train_X_1_pos = Encoder.fit_transform(Train_X['side'])
Test_X_1_pos = Encoder.fit_transform(Test_X['side'])
mat_train= hstack([Train_X_Tfidf_title,Train_X_Tfidf_text])
X_train_tfidf = hstack([mat_train, Train_X_1_pos.reshape(-1, 1)])
mat_test= hstack([Test_X_Tfidf_title,Test_X_Tfidf_text])
X_test_tfidf = hstack([mat_test, Test_X_1_pos.reshape(-1, 1)])

In [95]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf,Train_Y_1)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y_1)*100)

SVM Accuracy Score ->  78.2516891891892


In [96]:
print(metrics.classification_report(Test_Y_1, predictions_SVM,target_names=['Asian','Black','Latino','Middle Eastern','Native American','Other','Pacific Islander','White']))

ValueError: Number of classes, 9, does not match size of target_names, 8. Try specifying the labels parameter