In [1]:
#SVM English Data
import os
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import pickle
import matplotlib.pyplot as plt
import logging
import sys

In [2]:
# logger file name
# sys.argv[0] can be a full path and can contain dots in its directory part
# (e.g. "./run.py" or ".../python3.6/site-packages/ipykernel_launcher.py").
# Take the basename and strip only the final extension so the log file always
# gets a plain name; fall back to a fixed name when argv[0] is empty.
log_file_name = os.path.splitext(os.path.basename(sys.argv[0]))[0] or 'svm_english_data'

## SETTING LOGGER
# The log is written to <cwd>/<log_file_name>.txt; filemode='w' truncates any
# existing file each time this cell runs.
logging.basicConfig(
    filename=os.path.join(os.getcwd(), log_file_name + '.txt'),
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

logging.info("### SVM English Data.\n")

In [3]:
# Input pickle files, given relative to the working directory.
# NOTE: `label` holds the labels file path here; it is rebound to the loaded
# labels once the data is unpickled further down.
combined_data_path, label = 'combined_data.pkl', 'en-de-label.pkl'

In [4]:
## GETTING PICKLED DATA
def unpickle(path):
    """Load and return the Python object stored in the pickle file at `path`.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at trusted files.
    """
    with open(path, 'rb') as source:
        return pickle.load(source)


def save_pickle(path, obj, name):
    """Pickle `obj` into the file `<path>/<name>.pkl`.

    The '.pkl' suffix is appended to `name`; an existing file at that location
    is overwritten.
    """
    target = os.path.join(path, name) + '.pkl'
    with open(target, 'wb') as sink:
        pickle.dump(obj, sink)


# Load the combined documents and their labels from disk.
data = unpickle(combined_data_path)
# `label` starts out as the labels file path and is rebound to the loaded
# labels. The guard keeps this cell re-runnable: without it a second run would
# pass the already-loaded labels to open() and fail.
if isinstance(label, str):
    label = unpickle(label)

In [12]:
# numbering labels: map a running index to each distinct label, in sorted order
num_label = {idx: name for idx, name in enumerate(sorted(set(label)))}

In [13]:
# Display the index -> label-name mapping.
num_label

{0: 'agriculture',
 1: 'audiovisual_and_media',
 2: 'budget',
 3: 'competition',
 4: 'consumers',
 5: 'culture',
 6: 'customs',
 7: 'development',
 8: 'economic_and_monetary_affairs',
 9: 'education_training_youth',
 10: 'employment_and_social_policy',
 11: 'energy',
 12: 'enlargement',
 13: 'enterprise',
 14: 'environment',
 15: 'external_relations',
 16: 'external_trade',
 17: 'fight_against_fraud',
 18: 'food_safety',
 19: 'foreign_and_security_policy',
 20: 'human_rights',
 21: 'humanitarian_aid',
 22: 'information_society',
 23: 'institutional_affairs',
 24: 'internal_market',
 25: 'justice_freedom_security',
 26: 'maritime_affairs_and_fisheries',
 27: 'public_health',
 28: 'regional_policy',
 29: 'research_innovation',
 30: 'taxation',
 31: 'transport'}

In [15]:
# Hold out 20% of the documents for testing; the fixed seed keeps the split
# identical across runs.
train_set, test_set, train_label, test_label = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=13,
)

In [21]:
# splitting
# Each combined training document is cut on a triple newline: the first part
# goes to the English lists, the second to the German lists, and both share the
# same numeric label.

# Reverse of num_label (label name -> index), built once so each lookup is a
# single dict access instead of a scan over every entry.
label_to_num = {name: idx for idx, name in num_label.items()}

en_train_data = []
en_train_label = []
de_train_data = []
de_train_label = []
for document, label_name in zip(train_set, train_label):
    # Split once and reuse both halves.
    parts = document.split('\n\n\n')
    en_text, de_text = parts[0], parts[1]
    # An unknown label raises KeyError here instead of silently reusing the
    # index left over from the previous document.
    num = label_to_num[label_name]
    en_train_data.append(en_text)
    de_train_data.append(de_text)
    en_train_label.append(num)
    de_train_label.append(num)

In [22]:
# Sanity check: sizes of the four training lists.
len(en_train_data), len(en_train_label), len(de_train_data), len(de_train_label)

(2172, 2172, 2172, 2172)

In [24]:
# Spot-check one English training document together with its numeric label.
en_train_label[1], en_train_data[1]

(10,
 'european recovery drive european recovery summary of european-commission communication com final drive european recovery what do this european-commission communication do pron set series measure take trigger recovery european-union eu follow financial-crisis start summer intensify late key point the communication present ambitious programme aim at restore maintain stability financial sector the report present de larosière group pdf make supervision cornerstone stable financial system the european-commission intend establish supervisory framework detect potential risk relate financial-market early mean of eu-body oversee stability financial system whole european financial-supervision system security also integral part future european regulation to end commission make plan for legislative instrument establish regulatory supervisory standard hedge fund private-equity white paper tool early intervention prevent possible crisis report derivative complex structured product increase tr

In [25]:
# Spot-check the German document at the same index, with its numeric label.
de_train_label[1], de_train_data[1]

(10,
 'impuls fuer der aufschwung europa impuls fuer der aufschwung europa zusammenfassung des dokuments mitteilung der europaeischen kommission kom endgueltig impuls fuer der aufschwung europa was ist der zweck dieser mitteilung der europaeischen kommission ich erlaeutert einen reihen von massnahmen zur ankurbelung-der der europaeischen union eu der folge der im sommer begonnen und sich enden intensivierenden krise ergreifen werden muessen wichtige eckpunkte der mitteilung enthaelt einen ehrgeizig programm zur wiederherstellung und aufrechterhaltung ein stabil finanzsektors der bericht der delarosièregruppe zufolge pdf bilden der aufsicht ein eckpfeiler fuer einen stabil finanzsystem der europaeische kommission planen der schaffung ein aufsichtsrahmens zur frueherkennung potenzieller risiko der finanzmaerkte dieser umfasst einen europaeische einrichtung der ueber der stabilitaet der finanzsystems insgesamt wachen soll einen europaeische finanzaufsicht der sicherheit musste ebenso best

In [30]:
# Split the held-out documents the same way as the training ones: the part
# before the triple newline goes to the English lists, the part after it to the
# German lists, and both share the same numeric label.

# Reverse of num_label (label name -> index), built once so each lookup is a
# single dict access instead of a scan over every entry.
label_to_num = {name: idx for idx, name in num_label.items()}

en_test_data = []
en_test_label = []
de_test_data = []
de_test_label = []
for document, label_name in zip(test_set, test_label):
    # Split once and reuse both halves.
    parts = document.split('\n\n\n')
    en_text, de_text = parts[0], parts[1]
    # An unknown label raises KeyError here instead of silently reusing the
    # index left over from the previous document.
    num = label_to_num[label_name]
    en_test_data.append(en_text)
    de_test_data.append(de_text)
    en_test_label.append(num)
    de_test_label.append(num)

In [31]:
# Sanity check: sizes of the four test lists.
len(en_test_data), len(en_test_label), len(de_test_data), len(de_test_label)

(544, 544, 544, 544)

In [34]:
# Spot-check the first English test document together with its numeric label.
en_test_data[0], en_test_label[0]

('expert traffic human beings group expert traffic human beings the group expert responsible advise commission matter relate traffic human being exist since pron composition operation regularly adjust accordance development take place field within european-union eu-act commission-decision eu august set group experts traffic human beings repeal decision summary this decision establish group expert traffic human beings responsible advise european-commission anti traffic matter the group main task be provide commission write contribution matter relate traffic human being ensure coherent approach subject help commission assess evolution policy field national european international level identify possible measure provide forum discussion matter relate traffic human being the group shall compose fifteen member appoint commission four year basis call application pron member shall individual expertise experience prevention fight traffic human being pron shall citizen member state european-unio

In [36]:
# Spot-check the first German test document together with its numeric label.
de_test_data[0], de_test_label[0]

('sachverstaendigengruppe fuer menschenhandel sachverstaendigengruppe fuer menschenhandel der sachverstaendigengruppe der der kommission alle frage zum thema menschenhandel beraten soll bestehen seit mein zusammensetzung und mein arbeitsweise werden regelmaessig der entwicklung dies bereich innerhalb der europaeischen union eu angepasst rechtsakt beschluss eu der kommission vom august zur einsetzung der sachverstaendigengruppe fuer menschenhandel und zur aufhebung der beschluß eg abl vom zusammenfassung mit dies beschluss werden einen sachverstaendigengruppe fuer menschenhandel einsetzen der der europaeische kommission alle frage zum thema menschenhandel beraten soll der gruppe haben insbesondere folgend aufgeben ich legen der kommission schriftlich stellungnahme zum thema menschenhandel vor und gewaehrleistet einen kohaerentes vorgehen dies frage ich unterstuetzt der kommission bei der bewertung der politik dies bereich auf national europaeischer und international ebene sowie bei der 

## English Language

In [37]:
# creating tf-idf matrix
print('\nTF-IDF Creation Starting for English Language')
logging.info('TF-IDF Creation Starting')

# Word uni- and bi-grams, sublinear TF scaling, L2-normalised rows; terms seen
# in fewer than 5 documents are dropped and the vocabulary is capped at 10000.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    max_features=10000,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit the vocabulary and IDF weights on the English training documents only.
# The second positional parameter of fit() is `y`, which the vectorizer
# ignores, so the test documents must not be passed there.
vectorizer.fit(en_train_data)


TF-IDF Creation Starting for English Language


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='latin-1', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [40]:
# Starting SVM
print('Starting SVM')
logging.info('Starting SVM')

# Defining Classifiers parameters
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook train the same model.
SVM_EN = LinearSVC(C=1, verbose=True, random_state=13)

print('Training Cluster 1 SVM')

# fit the classifier on train data
en_train_features = vectorizer.transform(en_train_data)
SVM_EN.fit(en_train_features, en_train_label)

# predicting using classifier
en_test_features = vectorizer.transform(en_test_data)
y_pred_EN = SVM_EN.predict(en_test_features)
# NOTE(review): despite the message, nothing is written to disk in this cell;
# the predictions only live in y_pred_EN.
print("Saving the predictions")

Starting SVM
Training Cluster 1 SVM
[LibLinear]Saving the predictions


In [41]:
# Evaluate the English classifier on the held-out English test set.
# Every metric is printed for the notebook and also written to the log file,
# using %s placeholders so that logging formats the values itself.
accuracy = accuracy_score(en_test_label, y_pred_EN)
print("Accuracy:",accuracy)
logging.info("Accuracy: %s", accuracy)

classification_rep = classification_report(en_test_label, y_pred_EN)
print(classification_rep)
logging.info("Classification Report:\n%s", classification_rep)

conf_matrix = confusion_matrix(en_test_label, y_pred_EN)
logging.info('Confusion Matrix:\n%s', conf_matrix)
print(conf_matrix)

Accuracy: 0.8106617647058824
              precision    recall  f1-score   support

           0       0.95      0.78      0.86        23
           1       0.75      0.60      0.67         5
           2       0.86      0.86      0.86         7
           3       0.79      1.00      0.88        11
           4       0.80      0.80      0.80        25
           5       1.00      0.67      0.80         3
           6       1.00      0.80      0.89        10
           7       0.88      1.00      0.93        14
           8       0.93      1.00      0.96        27
           9       0.79      0.94      0.86        16
          10       0.75      0.92      0.83        26
          11       0.67      0.93      0.78        15
          12       0.67      0.67      0.67        12
          13       0.75      0.60      0.67        10
          14       0.77      0.86      0.81        35
          15       0.92      0.69      0.79        16
          16       1.00      0.78      0.88         