In [1]:
import pandas as pd
import numpy as np
import os
import json
import csv
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [5]:
path = "/home/jackalhan/Development/app_data/tf_idf_hiarchical"
dficf_path = os.path.join(path, 'dficf')
tfidf_path = os.path.join(path, 'tfidf')

In [68]:
## pandas style
# dficf_train_df = pd.read_csv(os.path.join(dficf_path, 'dficf_train_all_in_one.csv'), skipinitialspace=True, 
#                              converters={'vector':lambda x: json.loads(x)})
# dficf_test_df = pd.read_csv(os.path.join(dficf_path, 'dficf_test_all_in_one.csv'))

# tfidf_train_df = pd.read_csv(os.path.join(tfidf_path, 'tfidf_train_all_in_one.csv'))
# tfidf_test_df = pd.read_csv(os.path.join(tfidf_path, 'tfidf_test_all_in_one.csv'))

In [6]:
#DFICF Train
with open(os.path.join(dficf_path, 'dficf_train_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    _X_train = []
    _y_train = []
    for x in reader:
        _X_train.append(json.loads(x[1]))
        _y_train.append(x[2])
    df_X_train = np.array(_X_train)
    df_y_train = np.array(_y_train).reshape(len(_y_train),1)
    del _X_train
    del _y_train
    
#DFICF Test
with open(os.path.join(dficf_path, 'dficf_test_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    # ------------------------------------------------
    # reader needs to be sorted by actual index value in order to group same documents 
    # ------------------------------------------------
    sorted_reader = sorted(reader, key=lambda row: int(row[4]))
    
    _X_test = []
    _y_test = []
    _potential_category_test = []
    _actual_index_test = []
    for x in sorted_reader:
        _X_test.append(json.loads(x[1]))
        _potential_category_test.append(x[3])
        _actual_index_test.append(int(x[4]))
        _y_test.append(x[5])
    
    df_X_test = np.array(_X_test)
    df_y_test = np.array(_y_test).reshape(len(_y_test),1)
    df_potential_category_test = np.array(_potential_category_test).reshape(len(_potential_category_test),1)
    df_actual_index_test = np.array(_actual_index_test).reshape(len(_actual_index_test),1)
    ## ----------------------------------------------------
    # Actual_index is going to be used to group them so that each index can be 
    # trained to get
    # the best result and compare it with the actual y_test
    ## ----------------------------------------------------
    del _X_test
    del _y_test
    del _potential_category_test
    del _actual_index_test
    
#TFIDF Train
with open(os.path.join(tfidf_path, 'tfidf_train_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    _X_train = []
    _y_train = []
    for x in reader:
        _X_train.append(json.loads(x[1]))
        _y_train.append(x[2])
    tf_X_train = np.array(_X_train)
    tf_y_train = np.array(_y_train).reshape(len(_y_train),1)
    del _X_train
    del _y_train
        
#TFIDF Test
with open(os.path.join(tfidf_path, 'tfidf_test_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    _X_test = []
    _y_test = []
    for x in reader:
        _X_test.append(json.loads(x[1]))
        _y_test.append(x[2])
    tf_X_test = np.array(_X_test)
    tf_y_test = np.array(_y_test).reshape(len(_y_test),1)
    del _X_test
    del _y_test

## Preprocessing for TFIDF

In [7]:
# TFIDF
tf_y_unique_labels = np.unique(tf_y_train)
tf_le = preprocessing.LabelEncoder()
tf_le.fit(tf_y_unique_labels)

LabelEncoder()

In [8]:
tf_y_train_encoded = tf_le.transform(tf_y_train)

  y = column_or_1d(y, warn=True)


In [9]:
tf_y_test_encoded = tf_le.transform(tf_y_test)

  y = column_or_1d(y, warn=True)


## Preprocessing for DFICF

In [10]:
# TFIDF
df_y_unique_labels = np.unique(df_y_train)
df_le = preprocessing.LabelEncoder()
df_le.fit(df_y_unique_labels)

LabelEncoder()

In [11]:
df_y_train_encoded = df_le.transform(df_y_train)

  y = column_or_1d(y, warn=True)


In [12]:
df_y_test_encoded = df_le.transform(df_y_test)

  y = column_or_1d(y, warn=True)


# K-NearestNeighbours Classifier

In [13]:
clf = CalibratedClassifierCV(base_estimator=LinearSVC(penalty='l2', dual=False), cv=5)

## TFIDF K-NN Processing

In [14]:
clf.fit(tf_X_train, tf_y_train_encoded)

CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=5, method='sigmoid')

In [15]:
#KNN Labels
clf.classes_

array([0, 1, 2, 3, 4])

In [16]:
#KNN Encoded Labels
tf_y_unique_labels

array(['business', 'entertainment', 'politics', 'sport', 'tech'], 
      dtype='<U13')

In [17]:
correct_predictions = 0
wrong_predictions = 0
for indx, _ in enumerate(tf_X_test):
    tf_X_sample_record = _.reshape(1, _.shape[0])
    #sample_indx = 0
    #tf_X_sample = tf_X_test[sample_indx].reshape(1, tf_X_test[sample_indx].shape[0])
    predicted = clf.predict_proba(tf_X_sample_record)
    print('Actual:', tf_y_test[indx])
    print('Predicted:', predicted)
    predicted_index = np.argmax(predicted)
    print('Predicted Index:', tf_y_test[indx])
    
    if (tf_y_unique_labels[predicted_index] == tf_y_test[indx]):
        correct_predictions +=1
        print('Correct')
    else:
        wrong_predictions +=1
        #print(predicted)

Actual: ['tech']
Predicted: [[ 0.07594541  0.01873647  0.02610924  0.04336398  0.8358449 ]]
Predicted Index: ['tech']
Correct
Actual: ['sport']
Predicted: [[ 0.00359696  0.0049514   0.0062612   0.95323075  0.03195968]]
Predicted Index: ['sport']
Correct
Actual: ['politics']
Predicted: [[ 0.12737835  0.00542857  0.79798629  0.00285849  0.0663483 ]]
Predicted Index: ['politics']
Correct
Actual: ['entertainment']
Predicted: [[ 0.01049312  0.91907723  0.01152179  0.02148664  0.03742122]]
Predicted Index: ['entertainment']
Correct
Actual: ['sport']
Predicted: [[ 0.00501617  0.02536706  0.01531949  0.94063843  0.01365884]]
Predicted Index: ['sport']
Correct
Actual: ['politics']
Predicted: [[ 0.02064326  0.01114416  0.94809488  0.00882092  0.01129678]]
Predicted Index: ['politics']
Correct
Actual: ['tech']
Predicted: [[ 0.0896562   0.02802303  0.01766989  0.02625103  0.83839985]]
Predicted Index: ['tech']
Correct
Actual: ['tech']
Predicted: [[ 0.01159516  0.32349764  0.01667276  0.00882974  0

Predicted: [[ 0.02677576  0.00967519  0.04241799  0.02334636  0.89778469]]
Predicted Index: ['tech']
Correct
Actual: ['business']
Predicted: [[ 0.9101043   0.01972947  0.01010169  0.01320271  0.04686182]]
Predicted Index: ['business']
Correct
Actual: ['tech']
Predicted: [[ 0.0532943   0.04512961  0.02554103  0.01380454  0.86223052]]
Predicted Index: ['tech']
Correct
Actual: ['tech']
Predicted: [[ 0.05039936  0.01542608  0.07755683  0.01775416  0.83886356]]
Predicted Index: ['tech']
Correct
Actual: ['entertainment']
Predicted: [[ 0.05618864  0.85911272  0.01278929  0.01410218  0.05780717]]
Predicted Index: ['entertainment']
Correct
Actual: ['entertainment']
Predicted: [[ 0.15767329  0.27297885  0.03383651  0.02311209  0.51239926]]
Predicted Index: ['entertainment']
Actual: ['business']
Predicted: [[ 0.89332774  0.01546218  0.06407935  0.01667367  0.01045707]]
Predicted Index: ['business']
Correct
Actual: ['sport']
Predicted: [[ 0.05185184  0.03696843  0.13061559  0.76810681  0.01245732]

### TFIDF K-NN Results:

In [18]:
print('Total Test Records : {}'.format(tf_X_test.shape[0]))
print('Total Correct Predictions : {}'.format(correct_predictions))
print('Total Wrong Predictions : {}'.format(wrong_predictions))
tf_knn_rate = (correct_predictions/tf_X_test.shape[0])* 100
print('Success Rate : {}%'.format(tf_knn_rate))

Total Test Records : 125
Total Correct Predictions : 122
Total Wrong Predictions : 3
Success Rate : 97.6%


## DFICF K-NN Processing

In [19]:
clf.fit(df_X_train, df_y_train_encoded)

CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=5, method='sigmoid')

In [20]:
#KNN Labels
clf.classes_

array([0, 1, 2, 3, 4])

In [21]:
#KNN Encoded Labels
df_y_unique_labels

array(['business', 'entertainment', 'politics', 'sport', 'tech'], 
      dtype='<U13')

In [22]:
correct_predictions = 0
wrong_predictions = 0
window_size = df_y_unique_labels.shape[0]
# iteration should be achieved in every jump_step/block like window_size, 
# because, df_X_test is ordered by the document index, or in other terms same document
# same document is getting repearted in each window
# thus, in each window, we need to iterate again for each document in this window.
for step in range(0, df_X_test.shape[0], window_size):
    max_predicted = None
    max_predicted_value = 0
    print(10 * '*')
    for indx in range(step, step + window_size):
        print('step:', step)
        print('index:', indx)
        print('record:', df_X_test[indx])
        print('label:', df_potential_category_test[indx])
        df_X_sample_record = df_X_test[indx].reshape(1, df_X_test[indx].shape[0])
        #sample_indx = 0
        #tf_X_sample = tf_X_test[sample_indx].reshape(1, tf_X_test[sample_indx].shape[0])
        predicted = clf.predict_proba(df_X_sample_record)
        predicted_value = np.max(predicted)
        print('Predictions in Window:', predicted)
        if predicted_value > max_predicted_value:
            max_predicted_value = predicted_value
            max_predicted = predicted
    
#     print('Max predicted:', max_predicted)
#     print('Max predicted value:', max_predicted_value)
    max_predicted_index = np.argmax(max_predicted)
#     print('Max predicted index:', max_predicted_index)
    
#     print('Predicted:', tf_y_unique_labels[max_predicted_index])
#     print('Actual:', df_y_test[step])
    if (df_y_unique_labels[max_predicted_index] == df_y_test[step]):
        correct_predictions +=1
    else:
        wrong_predictions +=1
        print('Max predicted:', max_predicted)
        print('Max predicted value:', max_predicted_value)
        print('Max predicted index:', max_predicted_index)
        print('Predicted:', tf_y_unique_labels[max_predicted_index])
        print('Actual:', df_y_test[step])
        print('Wrong','!'* 10)

**********
step: 0
index: 0
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.71882659  0.08821675  0.07028853  0.05801412  0.06465401]]
step: 0
index: 1
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.47685398  0.16511475  0.1253461   0.13965158  0.09303359]]
step: 0
index: 2
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.54114321  0.13103063  0.10903277  0.08422942  0.13456397]]
step: 0
index: 3
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.49157887  0.18165118  0.1194485   0.11190629  0.09541517]]
step: 0
index: 4
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.50585017  0.15188324  0.13048587  0.10856321  0.10321751]]
**********
step: 5
index: 5
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.43845332  0.1553611   0.14851514  0.1239267   0.13374373]]
st

Predictions in Window: [[ 0.16475835  0.11350136  0.11501895  0.48441476  0.12230657]]
step: 65
index: 66
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.17228055  0.40300148  0.1184295   0.10746851  0.19881996]]
step: 65
index: 67
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.65836646  0.08215944  0.0908339   0.0798106   0.0888296 ]]
step: 65
index: 68
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.00656816  0.02894103  0.03911731  0.03105858  0.89431492]]
step: 65
index: 69
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.3708572   0.13933095  0.22525114  0.12361996  0.14094075]]
**********
step: 70
index: 70
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.02481671  0.03602095  0.04580174  0.85392213  0.03943847]]
step: 70
index: 71
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
P

Predictions in Window: [[ 0.01173182  0.04375972  0.86980116  0.03449154  0.04021576]]
step: 115
index: 117
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.3591943   0.15590583  0.20653196  0.15164436  0.12672355]]
step: 115
index: 118
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.40499528  0.18020383  0.15335439  0.16096022  0.10048629]]
step: 115
index: 119
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.36673302  0.17061627  0.15442242  0.20793718  0.10029111]]
**********
step: 120
index: 120
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.46346378  0.16702825  0.14477865  0.13252787  0.09220145]]
step: 120
index: 121
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.28353271  0.36033383  0.11065437  0.09678119  0.14869791]]
step: 120
index: 122
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['ente

Predictions in Window: [[ 0.56381633  0.15145075  0.10857155  0.091216    0.08494537]]
step: 175
index: 176
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.50206011  0.16643821  0.12045128  0.12019931  0.09085108]]
step: 175
index: 177
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.47730259  0.17491435  0.13372866  0.09550159  0.11855281]]
step: 175
index: 178
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.39686481  0.30659841  0.11237727  0.09374653  0.09041298]]
step: 175
index: 179
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.48345926  0.16574929  0.14203854  0.10475993  0.10399298]]
Max predicted: [[ 0.56381633  0.15145075  0.10857155  0.091216    0.08494537]]
Max predicted value: 0.563816328418
Max predicted index: 0
Predicted: business
Actual: ['entertainment']
Wrong !!!!!!!!!!
**********
step: 180
index: 180
record: [ 0.  

label: ['sport']
Predictions in Window: [[ 0.47169152  0.16163097  0.1249115   0.14176514  0.10000087]]
step: 250
index: 251
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.02544338  0.02586408  0.03900123  0.02031021  0.88938111]]
step: 250
index: 252
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.47855521  0.17293109  0.11974809  0.11545821  0.1133074 ]]
step: 250
index: 253
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.67164851  0.08967289  0.08978955  0.0632952   0.08559386]]
step: 250
index: 254
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.49464333  0.12444645  0.16336867  0.08994345  0.1275981 ]]
**********
step: 255
index: 255
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.47091236  0.15938957  0.11813558  0.15549628  0.09606622]]
step: 255
index: 256
record: [ 0.  0.  0. ...,  0.  0

Max predicted: [[ 0.71216775  0.07662509  0.08094462  0.07327901  0.05698353]]
Max predicted value: 0.712167752515
Max predicted index: 0
Predicted: business
Actual: ['sport']
Wrong !!!!!!!!!!
**********
step: 305
index: 305
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.42765479  0.15915634  0.13469788  0.17643134  0.10205965]]
step: 305
index: 306
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.72047653  0.07266937  0.07515261  0.06785073  0.06385075]]
step: 305
index: 307
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.43584102  0.16959453  0.13809412  0.15094266  0.10552767]]
step: 305
index: 308
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.48685374  0.13557126  0.15030378  0.12300811  0.10426311]]
step: 305
index: 309
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.12032889  0.06344866  

record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.46937633  0.16391826  0.12857207  0.14286411  0.09526924]]
Max predicted: [[ 0.82211871  0.04422003  0.05483841  0.03184077  0.04698208]]
Max predicted value: 0.822118711778
Max predicted index: 0
Predicted: business
Actual: ['politics']
Wrong !!!!!!!!!!
**********
step: 365
index: 365
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.50733441  0.15133939  0.13361681  0.11642881  0.09128058]]
step: 365
index: 366
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.79728224  0.05930936  0.05700871  0.04125924  0.04514045]]
step: 365
index: 367
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.58421064  0.12863206  0.10766312  0.09278783  0.08670635]]
step: 365
index: 368
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.4744647   0.16326745  0.1350658   0.13363689  0.

record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.12617343  0.11132081  0.57054497  0.09152233  0.10043846]]
step: 415
index: 416
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.42889455  0.19373336  0.14496897  0.12577203  0.10663109]]
step: 415
index: 417
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.37116314  0.16191209  0.1664511   0.1262641   0.17420956]]
step: 415
index: 418
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.41388589  0.16954557  0.14522817  0.16640739  0.10493298]]
step: 415
index: 419
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.51689067  0.14571809  0.13955428  0.10472753  0.09310943]]
**********
step: 420
index: 420
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.43151611  0.142602    0.17384278  0.09813373  0.15390539]]
step: 420
inde

Predictions in Window: [[ 0.55621143  0.13857205  0.11925549  0.10094086  0.08502017]]
step: 465
index: 469
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.56507109  0.08403058  0.07030311  0.05576026  0.22483496]]
**********
step: 470
index: 470
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.05876821  0.08144486  0.06602579  0.74942847  0.04433268]]
step: 470
index: 471
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.38580279  0.20256595  0.14086111  0.16956765  0.1012025 ]]
step: 470
index: 472
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.32663148  0.22864579  0.13910298  0.20240413  0.10321563]]
step: 470
index: 473
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
Predictions in Window: [[ 0.36855808  0.20186438  0.14629232  0.1810594   0.10222582]]
step: 470
index: 474
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['

step: 525
index: 527
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.41579941  0.14488303  0.14299427  0.18206798  0.11425531]]
step: 525
index: 528
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.64660071  0.08865065  0.11765528  0.06474114  0.08235222]]
step: 525
index: 529
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.43226383  0.20201867  0.14360455  0.10986566  0.11224729]]
**********
step: 530
index: 530
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
Predictions in Window: [[ 0.80808499  0.05267916  0.05316445  0.03883372  0.04723768]]
step: 530
index: 531
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.50129806  0.16639379  0.12003996  0.11614313  0.09612507]]
step: 530
index: 532
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.61526036  0.1133939   0.0954124   0.0824573

Predictions in Window: [[ 0.02295217  0.03841264  0.04395161  0.85747841  0.03720517]]
step: 595
index: 599
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.27554238  0.13447187  0.11870251  0.3376848   0.13359843]]
**********
step: 600
index: 600
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
Predictions in Window: [[ 0.35502644  0.16027879  0.1283299   0.2501179   0.10624697]]
step: 600
index: 601
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
Predictions in Window: [[ 0.32651684  0.33658063  0.11148141  0.11399692  0.1114242 ]]
step: 600
index: 602
record: [ 0.03005935  0.          0.         ...,  0.          0.          0.        ]
label: ['politics']
Predictions in Window: [[ 0.5116267   0.13461649  0.14425988  0.10615721  0.10333971]]
step: 600
index: 603
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
Predictions in Window: [[ 0.05128963  0.0281632   0.03570542  0.03043149  0.85441027]]
step: 600
index: 604
reco

### DFICF K-NN Results: 

In [23]:
print('Total Test Records : {}'.format(df_X_test.shape[0]/window_size))
print('Total Correct Predictions : {}'.format(correct_predictions))
print('Total Wrong Predictions : {}'.format(wrong_predictions))
df_knn_rate = (correct_predictions/(df_X_test.shape[0]/window_size))* 100
print('Success Rate : {}%'.format(df_knn_rate))

Total Test Records : 125.0
Total Correct Predictions : 109
Total Wrong Predictions : 16
Success Rate : 87.2%
