In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [86]:
def read_dataset(path = 'datasets/train.csv'):
    df = pd.read_csv(path)
    print("Shape of the dataset: {}".format(df.shape))
    return df;
df = read_dataset()
df

Shape of the dataset: (3048, 3)


Unnamed: 0,id,query,label
0,0,شرایط حذف ترم چیه؟,1
1,1,از کجا می تونم با دکتر وحیدی ارتباط برقرار کنم؟,2
2,2,بوفه برداران تا ساعت چند باز است؟,2
3,3,کمترین تعداد واحد چند عدد است؟,1
4,4,سنگ جامد است,5
...,...,...,...
3043,3043,چند تا درس میشه حذف کرد,1
3044,3044,جدید ترین ویرایش کتاب هریس که موجوده چیه؟,3
3045,3045,شرایط مهمان شدن در دانشکده ما چیست؟,1
3046,3046,آمفی تئاتر دانشکده کامپیوتر کجاست؟,2


In [4]:
def cal_class_prob(d):
    classProb = d.groupby(['label']).size().reset_index(name='counts')['counts']/len(d)
    return classProb
class_prob = cal_class_prob(df)
class_prob

0    0.269029
1    0.233596
2    0.152559
3    0.190617
4    0.154199
Name: counts, dtype: float64

In [87]:
import re
import string
import math

#preprocessing
df_copy = df.copy()
#expand contractions
contractions_dict = {
    "کامپیوترا": "کامپیوتر ها",
    "کلاسا":"کلاس ها",
    "میتونم": "می تونم",
    "میتونیم":"می تونیم",
    "سرورای":"سرور های",
    "موجوده":"موجود هست",
    "چیه":"چی هست",
    "کدومه":"کدوم هست",
    "کجاست":"کجا هست",
    "چنده":"چند هست"
}

contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)


#delete punctuations
punctuations = '؟!.' + string.punctuation
def removePunctuations(x):
    return re.sub('[%s]' % re.escape(punctuations), '' , x)

#remove numbers
def removeNumbers(x):
    #x = re.sub('','',x)
    return re.sub('\W*\d\W*',' ',x)
def removeSpaces(x): 
    return re.sub('\s+',' ',x)

#delete common words
common_words = ['چی','هست','است','تا','از','را','به','بود','تو','ما','من','همه','های','ها','با']
def removeCommon(x):
    return ' '.join([word for word in x.split() if word not in common_words])

#remove outlier records with similiar query but different label 
def remove_outliers(df):
    def outliersList(g):
        g = g[np.abs(g.label - g.label.mean()) > (3 * g.label.std())]
        return g
    outliers = df_copy.groupby(['query']).apply(lambda g : outlierList(g))['id']
    return df.drop(outliers.values)
    

#remove duplicates and empty
def remove_dup(df):
    dup_mask = df['query'].duplicated()
    empty_mask = df['query'].str.len()>0
    df = df[~dup_mask & empty_mask]
    return df

def formatQuery(q):
    q = expand_contractions(q)
    q = removePunctuations(q)
    q = removeSpaces(removeNumbers(q))
    q = removeCommon(q)
    return q

def preProcess(df):
    df_copy = pd.DataFrame(df)
    df_copy['query'] = df_copy['query'].apply(lambda x : formatQuery(x))
    df_copy = remove_outliers(df_copy)
    df_copy = remove_dup(df_copy)
    return df_copy

print("dataset size before pre-processing : ",len(df_copy))
df_copy = preProcess(df_copy)
print("dataset size after pre-processing : ",len(df_copy))
df_copy

dataset size before pre-processing :  3048
dataset size after pre-processing :  2873


Unnamed: 0,id,query,label
0,0,شرایط حذف ترم,1
1,1,کجا می تونم دکتر وحیدی ارتباط برقرار کنم,2
2,2,بوفه برداران ساعت چند باز,2
3,3,کمترین تعداد واحد چند عدد,1
4,4,سنگ جامد,5
...,...,...,...
3043,3043,چند درس میشه حذف کرد,1
3044,3044,جدید ترین ویرایش کتاب هریس که موجود,3
3045,3045,شرایط مهمان شدن در دانشکده چیست,1
3046,3046,آمفی تئاتر دانشکده کامپیوتر کجا,2




# Naive Bayes classification



In [65]:
def get_words(df):
    queries = [q.split() for q in df['query']]
    words = [word 
                for query in queries
                    for word in query ]
    words = pd.Series(words).unique()
    return words
words = get_words(df_copy)
words

array(['شرایط', 'حذف', 'ترم', ..., 'استقلال', 'مس', 'ارکد'], dtype=object)

In [66]:
def get_tdm(df,words):
    freq_data = [[df['query'][i].split().count(word) for word in words] for i in df['id']]
    freq_df = pd.DataFrame(data = freq_data, columns = words , index = df['id'])
    return freq_df

freq_df = get_tdm(df_copy,words)
    
freq_df.head(10)

Unnamed: 0_level_0,شرایط,حذف,ترم,کجا,می,تونم,دکتر,وحیدی,ارتباط,برقرار,...,برمیگردن,خودشان,ولنجک,پاسداران,IoT,میچینه,منع,استقلال,مس,ارکد
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
#accumulated sum
def get_grouped_tdm(df):
    freq_df = df.copy()
    freq_df['label'] = df_copy['label']
    freq_df = freq_df.groupby(['label']).sum()
    return freq_df
gtdm = get_grouped_tdm(freq_df)

#smoothing
def apply_smoothing(df,alpha):
    df = df.apply(lambda x : x + alpha)
    df['count'] = df.apply(np.sum,axis = 1)
    return df
freq_df2 = apply_smoothing(gtdm,1)
counts = freq_df2['count']

freq_df2

Unnamed: 0_level_0,شرایط,حذف,ترم,کجا,می,تونم,دکتر,وحیدی,ارتباط,برقرار,...,خودشان,ولنجک,پاسداران,IoT,میچینه,منع,استقلال,مس,ارکد,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,36,134,152,56,126,37,7,1,1,2,...,1,1,1,1,2,2,1,1,1,7549
2,6,2,6,234,89,24,19,5,2,3,...,1,1,1,2,1,1,1,1,1,5663
3,5,1,1,26,63,25,2,1,1,1,...,1,1,1,1,1,1,1,1,2,5214
4,7,8,23,2,30,1,2,1,3,3,...,2,1,1,1,1,1,1,1,1,6277
5,2,2,4,39,43,12,3,1,1,1,...,1,2,2,1,1,1,2,2,1,4889


In [68]:
#calculating probability of each word in class i
def get_prob(df):
    counts = df['count']
    def cal_prob(x):
        return x/counts
    prob = df.apply(lambda x : cal_prob(x))
    prob = prob.drop(['count'],axis=1)
    return prob
prob_df = get_prob(freq_df2)
prob_df

Unnamed: 0_level_0,شرایط,حذف,ترم,کجا,می,تونم,دکتر,وحیدی,ارتباط,برقرار,...,برمیگردن,خودشان,ولنجک,پاسداران,IoT,میچینه,منع,استقلال,مس,ارکد
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.004769,0.017751,0.020135,0.007418,0.016691,0.004901,0.000927,0.000132,0.000132,0.000265,...,0.000132,0.000132,0.000132,0.000132,0.000132,0.000265,0.000265,0.000132,0.000132,0.000132
2,0.00106,0.000353,0.00106,0.041321,0.015716,0.004238,0.003355,0.000883,0.000353,0.00053,...,0.000353,0.000177,0.000177,0.000177,0.000353,0.000177,0.000177,0.000177,0.000177,0.000177
3,0.000959,0.000192,0.000192,0.004987,0.012083,0.004795,0.000384,0.000192,0.000192,0.000192,...,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000384
4,0.001115,0.001274,0.003664,0.000319,0.004779,0.000159,0.000319,0.000159,0.000478,0.000478,...,0.000159,0.000319,0.000159,0.000159,0.000159,0.000159,0.000159,0.000159,0.000159,0.000159
5,0.000409,0.000409,0.000818,0.007977,0.008795,0.002454,0.000614,0.000205,0.000205,0.000205,...,0.000205,0.000205,0.000409,0.000409,0.000205,0.000205,0.000205,0.000409,0.000409,0.000205


In [69]:
#predicting new queries
class Prediction():
    def __init__(self,prob,class_prob,counts):
        self.prob = prob
        self.class_prob = class_prob
        self.counts = counts

    def chance(self,query,classNo):
        res = 1
        words = query.split()
        for word in words:
            if word in self.prob.columns.values:
                res *= self.prob.loc[classNo,word]
            else:
                res *= 1/self.counts[classNo]   
        return res

    def predict(self,query):
        max = 0.0
        fit = -1
        for i in range(1,6):
            c = self.chance(query,i) * self.class_prob[i-1]
            if(c>max):
                max=c
                fit=i
        return fit


In [70]:
#train
def train(df):
    df = preProcess(df)
    class_prob = cal_class_prob(df)
    words = get_words(df)
    freq_df = get_tdm(df,words)
    gtdm = get_grouped_tdm(freq_df)
    freq_df2 = apply_smoothing(gtdm,1)
    counts = freq_df2['count']
    prob_df = get_prob(freq_df2)
    return prob_df,class_prob,counts


In [79]:
from sklearn.metrics import f1_score
#3-fold cross-validation
from sklearn.model_selection import train_test_split

def three_fold(df):
    sum = 0.0
    length = len(df)//3
    for i in range(3):
        mask = (df['id'] >= i * length ) * (df['id'] < (i+1) * length)
        X_test = pd.DataFrame(df[mask])
        X_train = pd.DataFrame(df[~mask])
        prob, class_prob,counts = train(X_train)
        pred = Prediction(prob,class_prob,counts)
        X_test['query'] = X_test['query'].apply(lambda x : formatQuery(x))
        X_test['pred_label'] = X_test['query'].apply(lambda x : pred.predict(x))
        f1 = f1_score(X_test['label'], X_test['pred_label'] , average='micro')
        print("iteration ",i, " f1-score : ",f1)
        sum += f1
    return sum/3

print("average f1_score is: ",three_fold(df))

iteration  0  f1-score :  0.78248031496063
iteration  1  f1-score :  0.7480314960629921
iteration  2  f1-score :  0.8080708661417323
average f1_score is:  0.7795275590551182


In [76]:
test_df = pd.read_csv('datasets/test.csv')
print("Shape of the dataset: {}".format(test_df.shape))

result = test_df.copy()
test_df['query'] = test_df['query'].apply(lambda x : formatQuery(x))
pred = Prediction(prob_df,class_prob,counts)
result['label'] = test_df['query'].apply(lambda s : pred.predict(s))

result

Shape of the dataset: (762, 2)


Unnamed: 0,id,query,label
0,0,چرا آخر ترم درس ها انقدر فشرده میشوند؟,4
1,1,فرجه این ترم چقدر است؟,1
2,2,صندلی های دانشگاه را ابری کنید!,4
3,3,محل تشکیل امتحان,2
4,4,دانشکده زیراکس دارد؟,2
...,...,...,...
757,757,آیا پنج شنبه ها دانشگاه تعطیله؟,2
758,758,آزمایشگاه شبکه کجاست؟,2
759,759,ترم تابستان از چه تاریخی آغاز میشود؟,1
760,760,آلودگی امروز چجوریه؟,5


In [77]:
#save results to csv
result.to_csv('result.csv',columns=['id','label'],index=False)
file = pd.read_csv('result.csv')
file.head(5)

Unnamed: 0,id,label
0,0,4
1,1,1
2,2,4
3,3,2
4,4,2


# KNN classification

In [81]:
def similarity_score(a,b):
    score = 0.01
    if a!=b and (len(a)<=2 or len(b)<=2):
        return score
    if a==b:
        score = 2
    elif a in b:
        score = len(a)/len(b)
    elif b in a:
        score = len(b)/len(a)
    else:
        s1 = a if len(a)<=len(b) else b
        s2 = a if len(a)>len(b) else b
        score = ((len(s1)-1)/len(s1)) * max(similarity_score(s1[:-1],s2) , similarity_score(s1[1:],s2))
    return score


In [82]:
from collections import Counter

class KNN():
    def __init__(self,k):
        self.k = k
    
    def cal_similarity(self,train_q,test_q):
        scores = [max([similarity_score(x,y) for y in train_q.split()]) for x in test_q.split()]
        return np.prod(scores)
    
    def predict(self,df,s):
        pred = pd.DataFrame(df)
        pred.loc[:,'score'] = pred['query'].apply(lambda x : self.cal_similarity(x,s))
        model = pred.sort_values('score',ascending=False).head(self.k)
        counter = Counter(model['label'].values)
        val, cnt = counter.most_common()[0]
        return val


In [83]:
df = read_dataset()
df = preProcess(df)
knn = KNN(5)

def knn_three_fold(df):
    sum = 0.0
    length = len(df)//3
    for i in range(3):
        mask = (df['id'] >= i * length ) * (df['id'] < (i+1) * length)
        X_test = pd.DataFrame(df[mask])
        X_train = pd.DataFrame(df[~mask])
        
        X_test['query'] = X_test['query'].apply(lambda x : formatQuery(x))
        X_test['pred_label'] = X_test['query'].apply(lambda x : knn.predict(X_train,x))
        
        f1 = f1_score(X_test['label'], X_test['pred_label'] , average='micro')
        print("iteration ",i, " f1-score : ",f1)
        sum += f1
    return sum/3
knn_avg_f1score = knn_three_fold(df)
knn_avg_f1score

Shape of the dataset: (3048, 3)
iteration  0  f1-score :  0.723175965665236
iteration  1  f1-score :  0.6714922048997772
iteration  2  f1-score :  0.7191392978482446


0.704602489471086

In [11]:
df = read_dataset()
df = preProcess(df)
df.head(5)

Shape of the dataset: (3048, 3)


Unnamed: 0,id,query,label
0,0,شرایط حذف ترم,1
1,1,کجا می تونم دکتر وحیدی ارتباط برقرار کنم,2
2,2,بوفه برداران ساعت چند باز,2
3,3,کمترین تعداد واحد چند عدد,1
4,4,سنگ جامد,5


In [12]:
#knn results
knn = KNN(5)
knn_test_df = pd.read_csv('datasets/test.csv')

knn_result = pd.DataFrame(knn_test_df)
knn_test_df['query'] = knn_test_df['query'].apply(lambda x : formatQuery(x))

knn_result['label'] = knn_test_df['query'].apply(lambda s : knn.predict(df,s))
knn_result

Unnamed: 0,id,query,label
0,0,چرا آخر ترم درس انقدر فشرده میشوند,4
1,1,فرجه این ترم چقدر,1
2,2,صندلی دانشگاه ابری کنید,4
3,3,محل تشکیل امتحان,4
4,4,دانشکده زیراکس دارد,3
...,...,...,...
757,757,آیا پنج شنبه دانشگاه تعطیله,2
758,758,آزمایشگاه شبکه کجا,2
759,759,ترم تابستان چه تاریخی آغاز میشود,1
760,760,آلودگی امروز چجوریه,5


In [14]:
knn_result.to_csv('knn_result.csv',columns=['id','label'],index=False)
knn_result.to_csv('knn_result.csv',index=False)