In [338]:

import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from scipy.sparse import csr_matrix

%matplotlib inline



In [339]:
# Data conversion
def getdata(file):
    rawdata = open(file,"r")

    messages = []
    intent = []
    for line in rawdata.readlines(): 
        sentence = line.split("\t")

        actual_words = sentence[0].split(" ")
        encoded_words = sentence[1].split(" ")

        for index, word in enumerate(encoded_words):
            if word == "O":
                encoded_words[index] = actual_words[index]

        msg = " ".join(encoded_words[1:-1])
        label = encoded_words[-1][0:-1]

        messages.append(msg)
        intent.append(label)

    data = pd.DataFrame(data={'message':messages,'intent':intent})
    return data

In [340]:
train = getdata("atis-2.train.w-intent.iob.txt")
test = getdata("atis.test.w-intent.iob.txt")


In [341]:
test.head()

Unnamed: 0,message,intent
0,i would like to find a flight from B-fromloc.c...,atis_flight
1,on B-depart_date.month_name B-depart_date.day_...,atis_airfare
2,on B-depart_date.month_name B-depart_date.day_...,atis_flight
3,i would like a flight traveling B-round_trip I...,atis_flight
4,i would like a flight from B-fromloc.city_name...,atis_flight


In [342]:
train.groupby('intent')['message'].nunique()

intent
atis_abbreviation                             56
atis_aircraft                                 67
atis_aircraft#atis_flight#atis_flight_no       1
atis_airfare                                 321
atis_airline                                 109
atis_airline#atis_flight_no                    2
atis_airport                                  16
atis_capacity                                 13
atis_cheapest                                  1
atis_city                                     17
atis_distance                                 16
atis_flight                                 2567
atis_flight#atis_airfare                      11
atis_flight_no                                12
atis_flight_time                              40
atis_ground_fare                              14
atis_ground_service                          177
atis_ground_service#atis_ground_fare           1
atis_meal                                      6
atis_quantity                                 37
atis_restrict

In [343]:

## Clean Data
stops = set(stopwords.words("english"))
def cleandata(text, lowercase = False, remove_stops = False, stemming = False,lemmatize=False):
    txt = str(text)
   
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        txt = " ".join([lemmatizer.lemmatize(w, pos='v') for w in txt.split()])
        
    return txt

In [344]:

## Clean data 
trainClean = train['message'].map(lambda x: cleandata(x, lowercase=True,remove_stops=True, stemming=True, lemmatize=True))
testClean = test['message'].map(lambda x: cleandata(x, lowercase=True,remove_stops=True, stemming=True, lemmatize =True))

# Feature extraction
vectorizer = TfidfVectorizer(analyzer='word', min_df=0.0, max_df=1.0,max_features=1024, ngram_range=(1,2))
vec = vectorizer.fit(trainClean)

X_train = vec.transform(trainClean)
X_test = vec.transform(testClean)
y_train = train['intent']
y_test = test['intent']

In [345]:

neigh = KNeighborsClassifier(n_neighbors=5, weights="distance", p=2)
neigh_train = neigh.fit(X_train, y_train) 
y_pred = neigh_train.predict(X_test)

print("Multi-class accuracy:",accuracy_score(y_test, y_pred),"\n")
print(classification_report(y_test, y_pred))

Multi-class accuracy: 0.9025755879059351 

                             precision    recall  f1-score   support

          atis_abbreviation       0.77      1.00      0.87        33
              atis_aircraft       1.00      0.67      0.80         9
               atis_airfare       0.94      0.65      0.77        48
   atis_airfare#atis_flight       0.00      0.00      0.00         1
               atis_airline       1.00      0.87      0.93        38
               atis_airport       1.00      0.56      0.71        18
              atis_capacity       1.00      0.90      0.95        21
                  atis_city       0.60      0.50      0.55         6
              atis_day_name       0.00      0.00      0.00         2
              atis_distance       0.57      0.40      0.47        10
                atis_flight       0.92      0.98      0.95       632
   atis_flight#atis_airfare       0.00      0.00      0.00        12
   atis_flight#atis_airline       0.00      0.00      0.00 

  'precision', 'predicted', average, warn_for)


In [346]:
clf = GaussianNB()
clf.fit(X_train.toarray(),y_train)
y_pred = clf.predict(X_test.toarray())

print("Multi-class accuracy:",accuracy_score(y_test, y_pred),"\n")
print(classification_report(y_test, y_pred))

Multi-class accuracy: 0.5711086226203808 

                             precision    recall  f1-score   support

          atis_abbreviation       0.55      0.79      0.65        33
              atis_aircraft       0.37      0.78      0.50         9
               atis_airfare       0.37      0.69      0.48        48
   atis_airfare#atis_flight       0.00      0.00      0.00         1
               atis_airline       0.50      0.18      0.27        38
atis_airline#atis_flight_no       0.00      0.00      0.00         0
               atis_airport       0.80      0.44      0.57        18
              atis_capacity       0.83      0.90      0.86        21
                  atis_city       0.60      0.50      0.55         6
              atis_day_name       0.00      0.00      0.00         2
              atis_distance       1.00      0.60      0.75        10
                atis_flight       0.89      0.56      0.69       632
   atis_flight#atis_airfare       0.00      0.00      0.00 

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [347]:
clf = SVC(kernel="linear", C=10)

clf.fit(X_train.toarray(),y_train)
y_pred = clf.predict(X_test.toarray())

print("Multi-class accuracy:",accuracy_score(y_test, y_pred),"\n")
print(classification_report(y_test, y_pred))

Multi-class accuracy: 0.9473684210526315 

                             precision    recall  f1-score   support

          atis_abbreviation       0.97      1.00      0.99        33
              atis_aircraft       0.69      1.00      0.82         9
               atis_airfare       0.91      1.00      0.95        48
   atis_airfare#atis_flight       0.00      0.00      0.00         1
               atis_airline       0.97      1.00      0.99        38
               atis_airport       0.94      0.89      0.91        18
              atis_capacity       1.00      0.95      0.98        21
                  atis_city       0.60      0.50      0.55         6
              atis_day_name       0.00      0.00      0.00         2
              atis_distance       1.00      0.40      0.57        10
                atis_flight       0.96      0.98      0.97       632
   atis_flight#atis_airfare       1.00      0.33      0.50        12
   atis_flight#atis_airline       0.00      0.00      0.00 

  'precision', 'predicted', average, warn_for)


In [348]:
clf = ExtraTreesClassifier(n_estimators=200)
clf.fit(X_train.toarray(),y_train)
y_pred = clf.predict(X_test.toarray())

print("Multi-class accuracy:",accuracy_score(y_test, y_pred),"\n")
print(classification_report(y_test, y_pred))

Multi-class accuracy: 0.93505039193729 

                             precision    recall  f1-score   support

          atis_abbreviation       1.00      0.97      0.98        33
              atis_aircraft       0.60      1.00      0.75         9
               atis_airfare       0.81      0.98      0.89        48
   atis_airfare#atis_flight       0.00      0.00      0.00         1
               atis_airline       1.00      0.97      0.99        38
               atis_airport       0.94      0.94      0.94        18
              atis_capacity       1.00      0.86      0.92        21
                  atis_city       0.50      0.83      0.62         6
              atis_day_name       0.00      0.00      0.00         2
              atis_distance       1.00      0.30      0.46        10
                atis_flight       0.97      0.98      0.97       632
   atis_flight#atis_airfare       0.00      0.00      0.00        12
   atis_flight#atis_airline       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


In [349]:
clf = RandomForestClassifier(n_estimators=200)

clf.fit(X_train.toarray(),y_train)
y_pred = clf.predict(X_test.toarray())

print("Multi-class accuracy:",accuracy_score(y_test, y_pred),"\n")
print(classification_report(y_test, y_pred))

Multi-class accuracy: 0.9361702127659575 

                             precision    recall  f1-score   support

          atis_abbreviation       0.94      1.00      0.97        33
              atis_aircraft       0.73      0.89      0.80         9
               atis_airfare       0.82      0.96      0.88        48
   atis_airfare#atis_flight       0.00      0.00      0.00         1
               atis_airline       1.00      1.00      1.00        38
               atis_airport       0.94      0.83      0.88        18
              atis_capacity       1.00      0.90      0.95        21
                  atis_city       0.50      0.83      0.62         6
              atis_day_name       0.00      0.00      0.00         2
              atis_distance       1.00      0.10      0.18        10
                atis_flight       0.96      0.99      0.97       632
   atis_flight#atis_airfare       0.00      0.00      0.00        12
   atis_flight#atis_airline       0.00      0.00      0.00 

  'precision', 'predicted', average, warn_for)


In [350]:
Models = [/SVC(kernel="linear", C=10),RandomForestClassifier(n_estimators=200),ExtraTreesClassifier(n_estimators=200),GaussianNB(),KNeighborsClassifier(n_neighbors=5, weights="distance", p=2)]
#create table to compare Model metric
Models_columns = ['Model Name', 'Accuracy score']
Models_compare = pd.DataFrame(columns = Models_columns)
row_index = 0
for alg in Models:

    #set name and parameters
    Models_name = alg.__class__.__name__
    Models_compare.loc[row_index, 'Model Name'] = Models_name
   #score model with cross validation: 
    alg.fit(X_train.toarray(),y_train)
    y_pred = alg.predict(X_test.toarray())
    Models_compare.loc[row_index, 'Accuracy score'] = accuracy_score(y_test,y_pred)  
    row_index+=1

In [351]:
Models_compare.sort_values(['Accuracy score'])

Unnamed: 0,Model Name,Accuracy score
3,GaussianNB,0.571109
4,KNeighborsClassifier,0.902576
1,RandomForestClassifier,0.924972
2,ExtraTreesClassifier,0.93841
0,SVC,0.947368


In [352]:

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=7)

clf = SVC(kernel="linear", C=10)
Multi_class_accuracy=[]
for train_index, test_index in skf.split(X_train, y_train):
    X_train_k, X_test_k = X_train[train_index], X_train[test_index]
    y_train_k, y_test_k = train["intent"][train_index], train["intent"][test_index]
    
    clf.fit(X_train_k,y_train_k)
    y_pred = clf.predict(X_test_k)
    print("Multi-class accuracy:",accuracy_score(y_test_k, y_pred),"\n")
    Multi_class_accuracy.append(accuracy_score(y_test_k, y_pred))



Multi-class accuracy: 0.963020030816641 

Multi-class accuracy: 0.967391304347826 

Multi-class accuracy: 0.9580093312597201 

Multi-class accuracy: 0.96875 

Multi-class accuracy: 0.9827586206896551 

Multi-class accuracy: 0.973186119873817 

Multi-class accuracy: 0.9793650793650793 



In [353]:
max(Multi_class_accuracy)

0.9827586206896551