#### Task 4
### Decision Tree, KNN
- [x] Использовать деревья решений в задаче классификации (sklearn)
- [x] Использовать KNN в задаче классификации (sklearn)

*Davletyarov Ildar, 11-808*

In [6]:
import nltk

# stab for download nltk packages without SSL
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /Users/ildar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
from nltk.stem import PorterStemmer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from collections import defaultdict
import string
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.probability import FreqDist
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to /Users/ildar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ildar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# load data
DATA_PATH = "/Users/ildar/Desktop/labeledTrainData.tsv"
df = pd.read_csv(DATA_PATH, sep='\t', encoding='utf-8')

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [9]:
df.to_dict('records')[0]

{'id': '5814_8',
 'sentiment': 1,
 'review': "With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />Th

In [10]:
COUNT_WORDS = 1000
texts = df.review.tolist()
sentiment = df.sentiment.tolist()
texts = texts[:COUNT_WORDS]

In [11]:
def delete_html(text):
    return text.replace('<br />', '')

def transform_text(text, spec_chars, lemmatizer):
    frequency = {}
    w = []
    text = text.lower() # to lower
    word_list = nltk.word_tokenize(text) # tokenize
    text = ' '.join([lemmatizer.lemmatize(w) for w in word_list]) #lemmatize
    text = "".join([ch for ch in text if ch not in spec_chars]) # clear pointless chars
    text = delete_html(text) # delete tags
    fdist = FreqDist(text)
    freq = fdist.most_common(15) # delete non-words
    for j in range(15):
        if freq[j][1] >= 4:
            w.append(freq[j][0])
    if len(w) > 0:
        for k in range(len(w)):
            text = text.replace(w[k], '')
    word_list = nltk.word_tokenize(text)
    text = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    for word in text:
        count = frequency.get(word,0)
        frequency[word] = count + 1
    for word in frequency.keys():
        text = text.replace(word, str(frequency[word]))
    return text

In [12]:
class NaiveBayesClassifier:
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.class_probs = {}
        self.conditional_probabilities = {}
        self._class_freq = defaultdict(lambda: 0)
        self._feat_freq = defaultdict(lambda: 0)
    
    def fit(self, X, y):
        for feature, label in zip(X, y):
            self._class_freq[label] += 1
            for value in feature:
                self._feat_freq[(value, label)] += 1
        
        num_samples = len(X)
        for k in self._class_freq:
            self._class_freq[k] /= num_samples
            
        for value, label in self._feat_freq:
            self._feat_freq[(value, label)] /= self._class_freq[label]
            
        return self       
        
    def predict(self, X):
        return min(self._class_freq.keys(), key=lambda c : self._calculate_class_freq(X, c))
    
    def _calculate_class_freq(self, X, clss): # naive bayes ln of truth
        freq = -np.log(self._class_freq[clss])
        for feat in X:
            freq += -np.log(self._feat_freq.get((feat, clss), 10 ** (-7)))
        return freq

In [13]:
spec_chars = string.punctuation + '\n\t'
lemmatizer = WordNetLemmatizer()

for i in range(len(texts)):
    texts[i] = '0.' + transform_text(texts[i], spec_chars, lemmatizer)
    texts[i] = [float(texts[i])]

In [14]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [15]:
X = texts
Y = sentiment[:COUNT_WORDS]

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

In [17]:
# KNN DEPTH 10; NEIGHBORS 3
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
tree = DecisionTreeClassifier(max_depth = 10)
tree_simple = tree.fit(X_train, Y_train)
pred_proba = tree_simple.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(Y_test, pred_proba)
auc = metrics.roc_auc_score(Y_test, pred_proba)
predictions = tree_simple.predict(X_test)
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.54      0.59      0.56       158
           1       0.49      0.43      0.46       142

    accuracy                           0.52       300
   macro avg       0.51      0.51      0.51       300
weighted avg       0.51      0.52      0.51       300



In [18]:
from sklearn.neighbors import KNeighborsRegressor
KNN_model = KNeighborsRegressor(n_neighbors=3).fit(X_train,Y_train)

In [19]:
KNN_predict = KNN_model.predict(X_test)
pred_proba = tree_simple.predict_proba(X_test)[::,1]
auc = metrics.roc_auc_score(Y_test, pred_proba)

In [20]:
print(auc)

0.5277010162239258
