In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def read_dataset(path = 'datasets/train.csv'):
    """Load a CSV file into a DataFrame and report its shape.

    Parameters
    ----------
    path : str
        Location of the CSV file; defaults to the training set.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Its ``shape`` is printed as a side effect.
    """
    frame = pd.read_csv(path)
    message = "Shape of the dataset: {}".format(frame.shape)
    print(message)
    return frame
# Load the training set from the default path and preview its first ten rows.
df = read_dataset()
df.head(10)

Shape of the dataset: (3048, 3)


Unnamed: 0,id,query,label
0,0,شرایط حذف ترم چیه؟,1
1,1,از کجا می تونم با دکتر وحیدی ارتباط برقرار کنم؟,2
2,2,بوفه برداران تا ساعت چند باز است؟,2
3,3,کمترین تعداد واحد چند عدد است؟,1
4,4,سنگ جامد است,5
5,5,سرورای دانشکده مشکل دارن؟,3
6,6,کلاس آزمایشگاه فیزیک در دانشکده خودمان برگزار ...,2
7,7,شرایط حذف پزشکی چیه؟,1
8,8,در شرایطی ساعت و روز کلاسی جابجا می شود؟,1
9,9,سطل آشغال در کلاس 101 وجود ندارد.,4


In [3]:
def cal_class_prob(d):
    """Return the relative frequency of every label in ``d``.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with a ``label`` column.

    Returns
    -------
    pandas.Series
        Named ``counts`` and indexed 0..k-1, following the sorted order of
        the distinct labels; each entry is that label's share of the rows.
    """
    rows_per_label = d.groupby(['label']).size()
    counts = rows_per_label.reset_index(name='counts')['counts']
    total_rows = len(d)
    return counts / total_rows
# Label frequencies of the training frame as loaded (preprocessing happens in a later cell).
classProb = cal_class_prob(df)
classProb

0    0.269029
1    0.233596
2    0.152559
3    0.190617
4    0.154199
Name: counts, dtype: float64

In [4]:
import re
import string
import math

#preprocessing
# Work on a copy of the loaded frame; `df` itself is used again later for cross-validation.
df_copy = df.copy()
#expand contractions
# Colloquial / contracted Persian forms mapped to their expanded spelling.
contractions_dict = {
    "کامپیوترا": "کامپیوتر ها",
    "کلاسا":"کلاس ها",
    "میتونم": "میتوانم",
    "سرورای":"سرور های",
    "موجوده":"موجود هست",
    "چیه":"چی هست",
    "کدومه":"کدوم هست",
    "کجاست":"کجا هست",
    "چنده":"چند هست"
}

def _build_contractions_re(mapping):
    """Compile one alternation pattern that matches any key of ``mapping``."""
    # Longest keys first so a key that is a prefix of another cannot shadow
    # it, and escape every key so regex metacharacters are matched literally.
    ordered_keys = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))

# Pattern for the default mapping, kept as a module-level name.
contractions_re = _build_contractions_re(contractions_dict)

def expand_contractions(text,contractions_dict=contractions_dict):
    """Replace every occurrence of a contraction in ``text`` by its expansion.

    Matching is by substring: a key is replaced wherever it occurs, including
    inside a longer word.

    Parameters
    ----------
    text : str
        The string to rewrite.
    contractions_dict : dict
        Mapping ``contraction -> expansion``. The pattern is derived from the
        mapping that is actually passed, so a custom mapping is honoured
        instead of being silently matched against the default keys.

    Returns
    -------
    str
        ``text`` with all contractions expanded; unchanged for an empty mapping.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string everywhere.
        return text
    # re.compile caches compiled patterns, so rebuilding per call stays cheap.
    pattern = _build_contractions_re(contractions_dict)
    def replace(match):
        return contractions_dict[match.group(0)]
    return pattern.sub(replace, text)


#delete punctuations
# Characters to strip: the Persian question mark plus all ASCII punctuation.
punctuations = '؟!.' + string.punctuation
def removePunctuations(x):
    """Return ``x`` with every character listed in ``punctuations`` removed."""
    escaped_chars = re.escape(punctuations)
    char_class = '[%s]' % escaped_chars
    return re.sub(char_class, '' , x)

#remove numbers
def removeNumbers(x):
    """Replace each digit, plus any non-word characters touching it, by one space.

    Every digit is matched on its own, so a multi-digit number leaves several
    consecutive spaces behind; ``removeSpaces`` is expected to collapse them.
    """
    # Raw string: '\W' and '\d' are regex classes, not string escapes. In a
    # plain literal they are invalid escape sequences (SyntaxWarning on 3.12+).
    return re.sub(r'\W*\d\W*',' ',x)
def removeSpaces(x):
    """Collapse every run of whitespace in ``x`` into a single space."""
    # Raw string so '\s' reaches the regex engine instead of being an
    # invalid string escape (SyntaxWarning on Python 3.12+).
    return re.sub(r'\s+',' ',x)

#delete common words
# Stop words dropped from every query before counting.
common_words = ['چی','هست','است','تا','از','را','به','بود','تو','ما','من','همه','های','ها']
def removeCommon(x):
    """Drop every whitespace-separated token of ``x`` that is a stop word.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in x.split():
        if token in common_words:
            continue
        kept.append(token)
    return ' '.join(kept)

#remove duplicates
def remove_dup(df):
    """Keep only the first row for each distinct ``query`` value.

    Returns a filtered frame; surviving rows keep their original index labels.
    """
    is_repeat = df['query'].duplicated()
    first_occurrences = df[~is_repeat]
    return first_occurrences

def formatQuery(q):
    """Run one query string through the whole cleaning pipeline.

    Steps, in order: expand contractions, strip punctuation, blank out
    digits, collapse whitespace, drop stop words.
    """
    pipeline = (
        expand_contractions,
        removePunctuations,
        removeNumbers,
        removeSpaces,
        removeCommon,
    )
    for step in pipeline:
        q = step(q)
    return q

def preProcess(df):
    """Clean every query of ``df`` and drop rows whose cleaned query repeats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``query`` column of strings. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned queries, first occurrence of each
        cleaned query only; surviving rows keep their original index labels.
    """
    # Explicit copy: pd.DataFrame(df) does not guarantee independent data, so
    # assigning the column could leak into the caller's frame.
    df_copy = df.copy()
    df_copy['query'] = df_copy['query'].apply(formatQuery)
    # De-duplicate the cleaned copy, not the input. Deduplicating `df` only
    # worked when the assignment above had already modified `df` in place.
    df_copy = remove_dup(df_copy)
    return df_copy

# Replace the working copy by its preprocessed version; the bare name displays the result.
df_copy = preProcess(df_copy)
df_copy

Unnamed: 0,id,query,label
0,0,شرایط حذف ترم,1
1,1,کجا می تونم با دکتر وحیدی ارتباط برقرار کنم,2
2,2,بوفه برداران ساعت چند باز,2
3,3,کمترین تعداد واحد چند عدد,1
4,4,سنگ جامد,5
...,...,...,...
3043,3043,چند درس میشه حذف کرد,1
3044,3044,جدید ترین ویرایش کتاب هریس که موجود,3
3045,3045,شرایط مهمان شدن در دانشکده چیست,1
3046,3046,آمفی تئاتر دانشکده کامپیوتر کجا,2




# Naive Bayes classification



In [5]:
def get_words(df):
    """Collect the vocabulary of ``df['query']``.

    Returns
    -------
    numpy.ndarray
        The distinct whitespace-separated tokens, in order of first appearance.
    """
    tokens = []
    for text in df['query']:
        tokens.extend(text.split())
    vocabulary = pd.Series(tokens).unique()
    return vocabulary
# Vocabulary of the preprocessed training queries.
words = get_words(df_copy)
words

array(['شرایط', 'حذف', 'ترم', ..., 'استقلال', 'مس', 'ارکد'], dtype=object)

In [7]:
def get_tdm(df,words):
    """Build the document-term count matrix of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``query`` column (strings) and an ``id`` column.
    words : sequence of str
        Vocabulary; one output column per entry, in this order.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same order), indexed by ``df['id']``;
        cell (d, w) is how often word ``w`` occurs in document ``d``.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    freq_data = []
    # Walk the queries in row order instead of looking them up by id: the
    # frame's index labels are not guaranteed to coincide with the id column.
    for text in df['query']:
        # Tokenise and count once per document; Counter yields 0 for absent words.
        tally = Counter(text.split())
        freq_data.append([tally[word] for word in words])
    freq_df = pd.DataFrame(data = freq_data, columns = words , index = df['id'])
    return freq_df

# Count matrix of the preprocessed training queries over the vocabulary above.
freq_df = get_tdm(df_copy,words)
    
freq_df.head(10)

Unnamed: 0_level_0,شرایط,حذف,ترم,کجا,می,تونم,با,دکتر,وحیدی,ارتباط,...,برمیگردن,خودشان,ولنجک,پاسداران,IoT,میچینه,منع,استقلال,مس,ارکد
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#accumulated sum
def get_grouped_tdm(df, labels=None):
    """Sum a document-term matrix per class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Document-term counts, one row per document.
    labels : pandas.Series or array-like, optional
        Class label of every document. A Series is aligned to ``df`` by
        index; any other sequence is matched to the rows by position.
        When omitted, the notebook-level ``df_copy['label']`` is used, as
        before. Pass the labels explicitly whenever ``df`` was not built
        from ``df_copy``, otherwise rows unknown to ``df_copy`` get no label.

    Returns
    -------
    pandas.DataFrame
        One row per label (index named ``label``), one column per word.
        Documents without a label are left out of the sums.
    """
    if labels is None:
        labels = df_copy['label']
    # Group on an external key rather than a temporary 'label' column, so a
    # vocabulary word that happens to be called 'label' is not overwritten.
    keys = labels if isinstance(labels, pd.Series) else np.asarray(labels)
    grouped = df.groupby(keys).sum()
    return grouped.rename_axis('label')
# Per-label word totals for the training count matrix.
gtdm = get_grouped_tdm(freq_df)

#smoothing
def apply_smoothing(df,alpha):
    """Add ``alpha`` to every cell and append the per-row total.

    Returns a new frame with an extra ``count`` column holding the row sums
    of the smoothed values; the input frame is not modified.
    """
    smoothed = df + alpha
    row_totals = smoothed.sum(axis=1)
    smoothed['count'] = row_totals
    return smoothed
# Smooth the per-label totals with alpha = 1 and show them together with the row sums.
freq_df2 = apply_smoothing(gtdm,1)
freq_df2

Unnamed: 0_level_0,شرایط,حذف,ترم,کجا,می,تونم,با,دکتر,وحیدی,ارتباط,...,خودشان,ولنجک,پاسداران,IoT,میچینه,منع,استقلال,مس,ارکد,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,36,134,152,56,97,10,25,7,1,1,...,1,1,1,1,2,2,1,1,1,7547
2,6,2,6,233,76,13,13,19,5,2,...,1,1,1,2,1,1,1,1,1,5661
3,5,1,1,26,39,5,10,2,1,1,...,1,1,1,1,1,1,1,1,2,5201
4,7,8,23,2,30,1,28,2,1,3,...,2,1,1,1,1,1,1,1,1,6308
5,2,2,4,39,29,2,11,3,1,1,...,1,2,2,1,1,1,2,2,1,4889


In [10]:
#calculating probability of each word in class i
def get_prob(df):
    """Turn smoothed per-class counts into per-class word probabilities.

    Each word column is divided by the row's ``count`` value; the ``count``
    column itself is not part of the result.
    """
    totals = df['count']
    word_counts = df.drop(['count'],axis=1)
    return word_counts.apply(lambda column : column/totals)
# Word probabilities per label, derived from the smoothed totals above.
prob_df = get_prob(freq_df2)
prob_df

Unnamed: 0_level_0,شرایط,حذف,ترم,کجا,می,تونم,با,دکتر,وحیدی,ارتباط,...,برمیگردن,خودشان,ولنجک,پاسداران,IoT,میچینه,منع,استقلال,مس,ارکد
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.00477,0.017755,0.02014,0.00742,0.012853,0.001325,0.003313,0.000928,0.000133,0.000133,...,0.000133,0.000133,0.000133,0.000133,0.000133,0.000265,0.000265,0.000133,0.000133,0.000133
2,0.00106,0.000353,0.00106,0.041159,0.013425,0.002296,0.002296,0.003356,0.000883,0.000353,...,0.000353,0.000177,0.000177,0.000177,0.000353,0.000177,0.000177,0.000177,0.000177,0.000177
3,0.000961,0.000192,0.000192,0.004999,0.007499,0.000961,0.001923,0.000385,0.000192,0.000192,...,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000385
4,0.00111,0.001268,0.003646,0.000317,0.004756,0.000159,0.004439,0.000317,0.000159,0.000476,...,0.000159,0.000317,0.000159,0.000159,0.000159,0.000159,0.000159,0.000159,0.000159,0.000159
5,0.000409,0.000409,0.000818,0.007977,0.005932,0.000409,0.00225,0.000614,0.000205,0.000205,...,0.000205,0.000205,0.000409,0.000409,0.000205,0.000205,0.000205,0.000409,0.000409,0.000205


In [12]:
#predicting new queries
class Prediction():
    """Naive Bayes scorer built from pre-computed probability tables.

    Parameters
    ----------
    prob : pandas.DataFrame
        One row per class label, one column per known word; each cell is the
        probability of the word within that class.
    class_prob : sequence of float
        Class priors, ordered like the rows of ``prob``.
    counts : mapping or pandas.Series
        Per-class smoothed token total, looked up by class label; its
        reciprocal is the probability given to a word missing from ``prob``.
    """
    def __init__(self,prob,class_prob,counts):
        self.prob = prob
        self.class_prob = class_prob
        self.counts = counts

    @staticmethod
    def _safe_log(p):
        """Natural log that maps non-positive input to -inf instead of raising."""
        return math.log(p) if p > 0 else float('-inf')

    def _word_prob(self,word,classNo):
        """Probability of ``word`` in class ``classNo``, with a fallback for unseen words."""
        if word in self.prob.columns:
            return self.prob.loc[classNo,word]
        # Use this instance's totals, not whatever `counts` happens to be
        # bound at notebook level.
        return 1/self.counts[classNo]

    def chance(self,query,classNo):
        """Return the product of the word probabilities of ``query`` in ``classNo``.

        This is the plain product and underflows to 0.0 on long queries;
        ``predict`` scores in log space instead.
        """
        res = 1
        for word in query.split():
            res *= self._word_prob(word,classNo)
        return res

    def predict(self,query):
        """Return the label with the highest posterior score for ``query``.

        Scores are sums of logarithms, so long queries still separate the
        classes. Ties go to the class listed first in ``prob``. Returns -1
        when no class has a non-zero score.
        """
        best_score = float('-inf')
        fit = -1
        priors = list(self.class_prob)
        words = query.split()
        # Iterate over the labels actually present in the table instead of a
        # hard-coded 1..5 range; priors are matched to them by position.
        for position, label in enumerate(self.prob.index):
            score = self._safe_log(priors[position])
            for word in words:
                score += self._safe_log(self._word_prob(word,label))
            if score > best_score:
                best_score = score
                fit = label
        return fit


In [13]:
#train
def train(df):
    """Fit the Naive Bayes tables on a labelled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training rows with ``id``, ``query`` and ``label`` columns.

    Returns
    -------
    tuple
        ``(prob_df, class_prob, counts)``: per-class word probabilities,
        class priors, and per-class smoothed token totals.
    """
    df = preProcess(df)
    class_prob = cal_class_prob(df)
    words = get_words(df)
    freq_df = get_tdm(df,words)
    # Group on this frame's own labels. get_tdm emits one row per row of df,
    # in the same order, so the label array lines up by position. Taking the
    # labels from a notebook-level frame instead would tie the model to data
    # outside the training set and could leave training rows without a label.
    labels = df['label'].to_numpy()
    gtdm = freq_df.groupby(labels).sum().rename_axis('label')
    freq_df2 = apply_smoothing(gtdm,1)
    counts = freq_df2['count']
    prob_df = get_prob(freq_df2)
    return prob_df,class_prob,counts


In [14]:
from sklearn.metrics import f1_score
#3-fold cross-validation
from sklearn.model_selection import train_test_split

N_FOLDS = 3
# Accumulate under a dedicated name: rebinding `sum` would shadow the builtin
# for every cell executed afterwards.
f1_total = 0.0
# Folds are contiguous ranges of the id column.
# NOTE(review): this assumes ids run 0..len(df)-1 — confirm against the dataset.
length = len(df)//N_FOLDS
for i in range(N_FOLDS):
    mask = df['id'] >= i * length
    if i < N_FOLDS - 1:
        mask = mask & (df['id'] < (i+1) * length)
    # The last fold has no upper bound, so rows left over when len(df) is not
    # a multiple of N_FOLDS are still evaluated.
    X_test = df[mask].copy()
    X_train = df[~mask].copy()
    prob, class_prob,counts = train(X_train)
    pred = Prediction(prob,class_prob,counts)
    X_test['query'] = X_test['query'].apply(formatQuery)
    X_test['pred_label'] = X_test['query'].apply(pred.predict)
    f1 = f1_score(X_test['label'], X_test['pred_label'] , average='micro')
    print("iteration ",i, " f1-score : ",f1)
    f1_total += f1

print("average f1_score is: ",f1_total/N_FOLDS)

iteration  0  f1-score :  0.78248031496063
iteration  1  f1-score :  0.750984251968504
iteration  2  f1-score :  0.8070866141732284
average f1_score is:  0.7801837270341209


In [20]:
test_df = pd.read_csv('datasets/test.csv')
print("Shape of the dataset: {}".format(test_df.shape))

# Clean the test queries with the same pipeline used for training.
test_df['query'] = test_df['query'].apply(formatQuery)

# Fit on the complete training set so word probabilities, priors and token
# totals all come from one model. Reusing class_prob / counts left over from
# the last cross-validation fold would combine two different fits.
prob_df, class_prob, counts = train(df.copy())
pred = Prediction(prob_df,class_prob,counts)
# Independent copy: adding the prediction column must not alter test_df.
result = test_df.copy()
result['label'] = result['query'].apply(pred.predict)

result.head(10)

Shape of the dataset: (762, 2)


Unnamed: 0,id,query,label
0,0,چرا آخر ترم درس انقدر فشرده میشوند,4
1,1,فرجه این ترم چقدر,1
2,2,صندلی دانشگاه ابری کنید,4
3,3,محل تشکیل امتحان,2
4,4,دانشکده زیراکس دارد,2
5,5,اتاق اساتید در کدام طبقه,2
6,6,خوابگاه هنوز باز نشده اند,2
7,7,حجم تمرین فلان درس خیلی بالاست,4
8,8,انتخاب واحد کی,1
9,9,چجوری میتوانم برم آموزش دانشکده,2


In [22]:
#save results to csv
# Write only the id/label pairs, then read the file back to check what was written.
submission_columns = ['id','label']
result.to_csv('result.csv', index=False, columns=submission_columns)
file = pd.read_csv('result.csv')
file.head(5)

Unnamed: 0,id,label
0,0,4
1,1,1
2,2,4
3,3,2
4,4,2
