<h2>Classification</h2>

Authors: Casper Smit and Samantha Visbeek

In [1]:
# imports
import pandas as pd
import numpy as np
from import_data import *
from util import *


[nltk_data] Downloading package wordnet to /home/samantha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# build the merged dataframe and pass it straight through the project's cleaning step
df = clean_dataframe(import_merge())

# keep only the rows that have an hdi_score (drop NaN) and renumber the index from 0
df = df.dropna(subset=['hdi_score']).reset_index(drop=True)
df.head()

  0%|          | 0/8 [00:00<?, ?it/s]

Country Cleaning Report:
	933 values cleaned (11.0%)
	37 values unable to be parsed (0.44%), set to NaN
Result contains 8444 (99.56%) values in the correct format and 37 null values (0.44%)


Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,...,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS),Developed / Developing Countries,Session,Year,Speech,hdi_score,hdi_class,country_cleaned
0,1.0,World,142.0,Asia,34.0,Southern Asia,,,Afghanistan,4.0,...,x,x,,Developing,45,1990,"﻿Allow me, first of all, Sir, to congratulate ...",0.302,low,Afghanistan
1,1.0,World,150.0,Europe,39.0,Southern Europe,,,Albania,8.0,...,,,,Developed,45,1990,﻿It is a special pleasure for me to speak at t...,0.65,medium,Albania
2,1.0,World,142.0,Asia,145.0,Western Asia,,,United Arab Emirates,784.0,...,,,,Developing,45,1990,"﻿\nMr. President, on behalf of the delegation ...",0.723,high,United Arab Emirates
3,1.0,World,19.0,Americas,419.0,Latin America and the Caribbean,5.0,South America,Argentina,32.0,...,,,,Developing,45,1990,"﻿At the outset, let me convey to you, Sir, my ...",0.718,high,Argentina
4,1.0,World,9.0,Oceania,53.0,Australia and New Zealand,,,Australia,36.0,...,,,,Developed,45,1990,"﻿It is with great pleasure. Sir, that I congra...",0.871,very high,Australia


<h3>Text Processing</h3>


In [3]:
# keep a second copy of every speech with names stripped out (taken from the raw text)
df['SpeechNoNames'] = df['Speech'].apply(remove_names)

# text processing: run the same clean_string -> remove_stopw pipeline on both variants
for column in ('Speech', 'SpeechNoNames'):
    df[column] = df[column].apply(clean_string).apply(remove_stopw)

In [4]:
# drop the words that occur only once (handled by remove_single_occurence),
# processing the speeches with names first and the name-free speeches second
for column in ('Speech', 'SpeechNoNames'):
    df[column] = remove_single_occurence(df[column])

In [5]:
# each speech is a list of words at this point; join it back into one space-separated string
for column in ('Speech', 'SpeechNoNames'):
    df[column] = df[column].apply(' '.join)

In [6]:
# build one tf-idf dataframe per speech variant: with names first, then without names
df_tfidf, df_tfidfNoNames = (
    tf_idf(df[column]) for column in ('Speech', 'SpeechNoNames')
)



In [7]:
# classification inputs: the tf-idf frames (with and without names) are the features
features, featuresNoNames = df_tfidf, df_tfidfNoNames

# classification target: the hdi_class column of the speeches dataframe
target = df['hdi_class']

In [8]:
# fit and evaluate a logistic regression model on each feature set against the same target;
# the results are unpacked as predictions, classes, weights, accuracy, precision and recall
(pred, classes, weights,
 accuracy, precision, recall) = logreg(features, target)
(predNN, classesNN, weightsNN,
 accuracyNN, precisionNN, recallNN) = logreg(featuresNoNames, target)

In [9]:
# report each metric for the model with names, immediately followed by the model without names
for metric_name, score, scoreNN in (
    ('Accuracy', accuracy, accuracyNN),
    ('Precision', precision, precisionNN),
    ('Recall', recall, recallNN),
):
    print(f'{metric_name} logistic regression model: ', score)
    print(f'{metric_name} logistic regression model without names: ', scoreNN)

Accuracy logistic regression model:  0.8120603015075377
Accuracy logistic regression model without names:  0.7065326633165829
Precision logistic regression model:  [0.75319149 0.89705882 0.73705179 0.85232068]
Precision logistic regression model without names:  [0.62173913 0.80656934 0.61445783 0.76859504]
Recall logistic regression model:  [0.74369748 0.89377289 0.74297189 0.85957447]
Recall logistic regression model without names:  [0.60084034 0.80952381 0.61445783 0.79148936]


In [17]:

# number of distinctive words to report for each class
n = 5

# calculate the top n most distinctive words for each class,
# once for the model trained with names and once for the model trained without names
class_words_dict = top_n_distinctive_words(n, df_tfidf, classes, weights)
class_words_dictNN = top_n_distinctive_words(n, df_tfidfNoNames, classesNN, weightsNN)

[nltk_data] Downloading package wordnet to /home/samantha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# show the top n most distinctive words per class, first with names and then without
for heading, words_by_class in (
    ('With names:', class_words_dict),
    ('Without names:', class_words_dictNN),
):
    print(heading)
    display_words(words_by_class)

With names:
high
lebanese, carib, sexually, kutesas, mexicans


low
yeltsins, paperwork, africa, guilty, afresh


medium
philippine, guy, fw, namgyel, egregious


very high
zeal, humanistic, janus, bahama, europe


Without names:
high
natal, aggressed, region, slurs, islamic


low
election, partnering, subprime, pays, delegating


medium
democrat, soccer, drudgery, graduating, penury


very high
valued, necks, humanistic, responsibilities, rightness


