In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.model_selection import KFold 
import unicodedata




train_data_path = ''
train_data_df = pd.read_csv(os.path.join(train_data_path, 'train_data.csv'))

test_data_path = ''
test_data_df = pd.read_csv(os.path.join(test_data_path, 'test_data.csv'))

In [2]:
train_data_df

Unnamed: 0,language,text,label
0,dansk,"\nDette er et fremragende initiativ, og jeg st...",Ireland
1,dansk,"\nHr. formand, jeg er sikker på, at alle her e...",Ireland
2,dansk,"\nHr. formand, folk på den nordlige halvkugle ...",England
3,dansk,"\nHr. formand, med forbehold af nogle få ændri...",England
4,dansk,"\n\n - Hr. formand, jeg må protestere mod de...",England
...,...,...,...
41565,Nederlands,"\nMijnheer de Voorzitter, juridisch gezien is ...",England
41566,Nederlands,"\n\n . Mijnheer de Voorzitter, het is niet ...",Scotland
41567,Nederlands,"\nAls afgevaardigde van Ierland, het 'voedsele...",Ireland
41568,Nederlands,"\nMijnheer de Voorzitter, het is niet onterech...",England


In [3]:
test_data_df

Unnamed: 0,text
0,"\n\n Hr. formand, selv om vi i høj grad symp..."
1,\n\n Quiero dejar constancia de mi apoyo a e...
2,\n\n . – El comercio ilegal de riñones human...
3,"\nSignor Presidente, per introdurre una nota d..."
4,\nJeg stemte for meddelelsen af decharge til f...
...,...
13855,\nschriftlich. - Dieser Bericht handelt von wi...
13856,"\n\n Signor Presidente, desidero ringraziare..."
13857,\n. (EN) Ich unterstütze den Bericht von Herrn...
13858,"\n\n – Mijnheer de Voorzitter, ik ben verheu..."


In [4]:
etichete_unice = train_data_df['label'].unique()
print(etichete_unice)

label2id = {}
id2label = {}
for idx, eticheta in enumerate(etichete_unice):
    label2id[eticheta]=idx
    id2label[idx]=eticheta

print(label2id)
print(id2label)


['Ireland' 'England' 'Scotland']
{'Ireland': 0, 'England': 1, 'Scotland': 2}
{0: 'Ireland', 1: 'England', 2: 'Scotland'}


In [5]:
labels = []
for eticheta in train_data_df['label']:
    labels.append(label2id[eticheta])
labels=np.array(labels)

# Nu uita sa modificam inapoi
print(labels[:10])

[0 0 1 1 1 0 1 1 1 2]


In [6]:
labels = train_data_df['label'].apply(lambda etich: label2id[etich])
labels = labels.values

In [7]:
# luat in calcul cuvintele functionale, pot creste sau scadea acuratetea
# in cazul de fata, cuvintele functionale (stop words) indica elemente de gramatica
# care sunt specifice textelor de la Scotieni, Englezi, Irlandezi
import nltk
from nltk.corpus import stopwords

stopw=stopwords.words('danish')+stopwords.words('dutch')+stopwords.words('spanish')+stopwords.words('german')+stopwords.words('italian')


In [8]:
import re

def proceseaza(text):
    """Functie simpla de procesare a textului.
    Sugestii:
    - cum procesati \n new lines? (vezi functia strip())  ✔️
    - cum procesati empty token '' ✔️
    - puteti introduce un tokenizator din nltk ?✔️
    - puteti elimina sau pastra doar stop-words ?✔️
    """
    text = re.sub("[-.,;:!?\"\'\/()_*=~@#$^&`]", " ", text) #Eliminare caracere speciale I
    text = re.sub("[0-9+–%]", " ", text) # Eliminare caractere speciale II
    text = text.replace('\n',' ') # Replace de /n-uri
    pattern = r"((?<=^)|(?<= )).((?=$)|(?= ))" #Scotea caractere cu un singur caracter : a b c d etc... / Este Regexul
    text=re.sub("\s+", " ", re.sub(pattern, '', text).strip()) # Scoaterea efectiva
    text2=unicodedata.normalize("NFKD", text) #Normalizare pt eroare de un cuvant in most common
    text_in_cuvinte = text2.strip('  ').split(' ')
    text_in_cuvinte = list(filter(None, text_in_cuvinte))#Scoaterea de spatii goale
    text_in_cuvinte2=[]
    text_cuvinte=[]
    for i in  text_in_cuvinte:
            text_cuvinte.append(i.lower())
    for i in text_cuvinte:
        if i not in stopw :
            text_in_cuvinte2.append(i)
    return text_in_cuvinte2
  


data = train_data_df['text'].apply(proceseaza)



data2=test_data_df['text'].apply(proceseaza)


In [9]:
counter = Counter()
for text in data:
    counter.update(text)
print(counter.most_common(11))

[('på', 26094), ('für', 21854), ('eu', 17142), ('we', 15756), ('europa', 14991), ('presidente', 14734), ('europea', 13318), ('wij', 12340), ('parlamento', 11493), ('moeten', 10083), ('hr', 9797)]


In [10]:
print(len(data))
print(len(data2))

41570
13860


In [11]:
nr_test = int(10/100 * len(train_data_df))
print("Nr de date de test: ", nr_test)

nr_ramase = len(data) - nr_test
nr_valid = int(5/100 * nr_ramase)
print("Nr de date de validare: ", nr_valid)

nr_train = nr_ramase - nr_valid
print("Nr de date de antrenare: ", nr_train)

nr_rtest = int( len(test_data_df))
print("Nr de date de real-test: ", nr_rtest)

Nr de date de test:  4157
Nr de date de validare:  1870
Nr de date de antrenare:  35543
Nr de date de real-test:  13860


In [12]:
# luam niste indici de la 0 la N
indici = np.arange(0,len(train_data_df))
print(indici)
# ii permutam si apoi putem sa-i folosim pentru a amesteca datele
np.random.shuffle(indici)
print(indici)

[    0     1     2 ... 41567 41568 41569]
[21114 30040 18759 ...  9179 19405 10135]


In [13]:
train_data = data[indici[:nr_train]]
train_labels = labels[indici[:nr_train]]

valid_data = data[indici[nr_train : nr_train + nr_valid]]
valid_labels = labels[indici[nr_train : nr_train + nr_valid]]

test_data = data[indici[nr_train + nr_valid: ]]
test_labels = labels[indici[nr_train + nr_valid:]]

rtest_data = data2[:nr_rtest]



print(f'Nr de exemple de real test {len(rtest_data)}')
print(f'Nr de exemple de train {len(train_labels)}')
print(f'Nr de exemple de validare {len(valid_labels)}')
print(f'Nr de exemple de test {len(test_labels)}')

Nr de exemple de real test 13860
Nr de exemple de train 35543
Nr de exemple de validare 1870
Nr de exemple de test 4157


In [14]:
ctr = Counter(data[9]) # cele mai frecvente cuvinte din primul paragraf 
print(ctr)

Counter({'på': 3, 'støtter': 1, 'ordføreren': 1, 'foreslår': 1, 'parlamentet': 1, 'godkender': 1, 'aftalen': 1, 'langvarige': 1, 'tvister': 1, 'negativ': 1, 'indflydelse': 1, 'erhvervslivet': 1, 'slutforbrugere': 1, 'begge': 1, 'sider': 1, 'middelhavet': 1})


In [15]:
def count_most_common(how_many, texte_preprocesate):
    """Functie care returneaza cele mai frecvente cuvinte.
    """
    counter = Counter()
    #TODO:
    for text in texte_preprocesate:
        counter.update(text)
    cuvinte_caracteristice = []
    for cuvant, frecventa in counter.most_common(how_many):
        if cuvant.strip():
            cuvinte_caracteristice.append(cuvant)
    return cuvinte_caracteristice

In [16]:
def build_id_word_dicts(cuvinte_caracteristice):
    #Dictionarele word2id si id2word garanteaza o ordine pentru cuvintele caracteristice.
    word2id = {}
    id2word = {}
    for idx, cuv in enumerate(cuvinte_caracteristice):
        word2id[cuv] = idx
        id2word[idx] = cuv

    return word2id, id2word

In [17]:
def featurize(text_preprocesat, id2word):
    #Pentru un text preprocesat dat si un dictionar
    #care mapeaza pentru fiecare pozitie ce cuvant corespunde,
    #returneaza un vector care reprezinta
    #frecventele fiecarui cuvant.
    
    ctr = Counter(text_preprocesat)
    features = np.zeros(len(id2word))
    for idx in range(0, len(features)):
        cuvant = id2word[idx]
        features[idx] = ctr[cuvant]
    return features

In [18]:
def featurize_multi(texte, id2word):
    #Pentru un set de texte preprocesate si un dictionar
    #care mapeaza pentru fiecare pozitie ce cuvant corespunde,
    #returneaza matricea trasaturilor tuturor textelor.
    
    all_features = []
    for text in texte:
        all_features.append(featurize(text, id2word))
    return np.array(all_features)

In [19]:
cuvinte_caracteristice = count_most_common(3000,train_data)
# print(cuvinte_caracteristice)
word2id, id2word = build_id_word_dicts(cuvinte_caracteristice)

X_train = featurize_multi(train_data, id2word)
X_valid = featurize_multi(valid_data, id2word)
X_test = featurize_multi(test_data, id2word)
X_data=featurize_multi(data, id2word)
#procesarea datelor de real test
X_rtest= featurize_multi(rtest_data, id2word)

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
print(X_data.shape)

print(X_rtest.shape)



(35543, 3000)
(1870, 3000)
(4157, 3000)
(41570, 3000)
(13860, 3000)


In [20]:
# from sklearn.neighbors import KNeighborsClassifier
# neigh = KNeighborsClassifier(n_neighbors=5)
# neigh.fit(X_train, train_labels)
# vpreds = neigh.predict(X_valid)
# tpreds = neigh.predict(X_test)

# knpreds = neigh.predict(X_rtest)
# print(knpreds)

# print(accuracy_score(valid_labels, vpreds))
# print(accuracy_score(test_labels, tpreds))


In [21]:
# newlabel= []
# for  idx  in knpreds:
#     newlabel.append(id2label[idx])
# newlabel=np.array(newlabel)
# df = pd.DataFrame()
# df['id'] = test_data_df.index +1
# df['label'] =newlabel
# df.to_csv('submission_Kn.csv', index=False)

In [22]:
# from sklearn.naive_bayes import GaussianNB
# clf = GaussianNB()
# clf.fit(X_train, train_labels)

# vpreds = clf.predict(X_valid)
# tpreds = clf.predict(X_test)

# bpreds = clf.predict(X_rtest)
# print(bpreds)


# print(accuracy_score(valid_labels, vpreds))
# print(accuracy_score(test_labels, tpreds))

In [23]:
# newlabel= []
# for  idx  in bpreds:
#     newlabel.append(id2label[idx])
# newlabel=np.array(newlabel)

# df = pd.DataFrame()
# df['id'] = test_data_df.index +1
# df['label'] =newlabel
# df.to_csv('submission_bayes.csv', index=False)

In [24]:
# from sklearn import svm

# model = svm.LinearSVC(C=1)

# model.fit(X_train, train_labels)
# vpreds = model.predict(X_valid)
# tpreds = model.predict(X_test)
# trainpreds=model.predict(X_train)
# svmpreds=model.predict(X_rtest)

# print(accuracy_score(valid_labels, vpreds))
# print(accuracy_score(test_labels, tpreds))

# print(accuracy_score(train_labels,trainpreds))

In [25]:
# newlabel= []
# for  idx  in svmpreds:
#     newlabel.append(id2label[idx])
# newlabel=np.array(newlabel)
# df = pd.DataFrame()
# df['id'] = test_data_df.index +1
# df['label'] =newlabel
# df.to_csv('submission_svm.csv', index=False)

In [26]:
from sklearn.metrics import confusion_matrix

X = X_train
y =train_labels

k=5
kf = KFold(n_splits=k,random_state=None,shuffle=False)


model = svm.LinearSVC(C=1)
acc_score = []
for train_index, test_index in kf.split(X):  
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train,y_train)  
    pred_values1 = model.predict(X_test)   
    acc = accuracy_score(pred_values1 , y_test)
    print(confusion_matrix(pred_values1, y_test))
    acc_score.append(acc) 
    
avg_acc_score = sum(acc_score)/k
 
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))


pred_values = model.predict(X_rtest)
print(pred_values)
    
    



[[ 956  398  107]
 [ 709 3175  774]
 [ 132  327  531]]




[[ 923  394  142]
 [ 748 3166  740]
 [ 146  323  527]]




[[ 951  376  129]
 [ 679 3176  778]
 [ 148  353  519]]




[[ 943  426  131]
 [ 674 3167  724]
 [ 172  330  541]]
[[ 889  335  130]
 [ 770 3171  776]
 [ 161  329  547]]
accuracy of each fold - [0.6557884371922914, 0.6493177662118441, 0.6535377690251793, 0.6543331457512662, 0.6481429375351716]
Avg accuracy : 0.6522240111431505
[1 1 1 ... 1 0 2]




In [27]:
newlabel= []
for  idx  in pred_values:
    newlabel.append(id2label[idx])
newlabel=np.array(newlabel)
df = pd.DataFrame()
df['id'] = test_data_df.index +1
df['label'] =newlabel
df.to_csv('submission_k_fold-200.v2.svm.csv', index=False)