In [1]:
import pandas as pd
import numpy as np
import os
import json
import csv
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

In [2]:
path = "/home/jackalhan/Development/app_data/tf_idf_hiarchical"
dficf_path = os.path.join(path, 'dficf')
tfidf_path = os.path.join(path, 'tfidf')

In [3]:
## pandas style
# dficf_train_df = pd.read_csv(os.path.join(dficf_path, 'dficf_train_all_in_one.csv'), skipinitialspace=True, 
#                              converters={'vector':lambda x: json.loads(x)})
# dficf_test_df = pd.read_csv(os.path.join(dficf_path, 'dficf_test_all_in_one.csv'))

# tfidf_train_df = pd.read_csv(os.path.join(tfidf_path, 'tfidf_train_all_in_one.csv'))
# tfidf_test_df = pd.read_csv(os.path.join(tfidf_path, 'tfidf_test_all_in_one.csv'))

In [4]:
#DFICF Train
with open(os.path.join(dficf_path, 'dficf_train_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    _X_train = []
    _y_train = []
    for x in reader:
        _X_train.append(json.loads(x[1]))
        _y_train.append(x[2])
    df_X_train = np.array(_X_train)
    df_y_train = np.array(_y_train).reshape(len(_y_train),1)
    del _X_train
    del _y_train
    
#DFICF Test
with open(os.path.join(dficf_path, 'dficf_test_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    # ------------------------------------------------
    # reader needs to be sorted by actual index value in order to group same documents 
    # ------------------------------------------------
    sorted_reader = sorted(reader, key=lambda row: int(row[4]))
    
    _X_test = []
    _y_test = []
    _potential_category_test = []
    _actual_index_test = []
    for x in sorted_reader:
        _X_test.append(json.loads(x[1]))
        _potential_category_test.append(x[3])
        _actual_index_test.append(int(x[4]))
        _y_test.append(x[5])
    
    df_X_test = np.array(_X_test)
    df_y_test = np.array(_y_test).reshape(len(_y_test),1)
    df_potential_category_test = np.array(_potential_category_test).reshape(len(_potential_category_test),1)
    df_actual_index_test = np.array(_actual_index_test).reshape(len(_actual_index_test),1)
    ## ----------------------------------------------------
    # Actual_index is going to be used to group them so that each index can be 
    # trained to get
    # the best result and compare it with the actual y_test
    ## ----------------------------------------------------
    del _X_test
    del _y_test
    del _potential_category_test
    del _actual_index_test
    
#TFIDF Train
with open(os.path.join(tfidf_path, 'tfidf_train_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    _X_train = []
    _y_train = []
    for x in reader:
        _X_train.append(json.loads(x[1]))
        _y_train.append(x[2])
    tf_X_train = np.array(_X_train)
    tf_y_train = np.array(_y_train).reshape(len(_y_train),1)
    del _X_train
    del _y_train
        
#TFIDF Test
with open(os.path.join(tfidf_path, 'tfidf_test_all_in_one.csv'), 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    _X_test = []
    _y_test = []
    for x in reader:
        _X_test.append(json.loads(x[1]))
        _y_test.append(x[2])
    tf_X_test = np.array(_X_test)
    tf_y_test = np.array(_y_test).reshape(len(_y_test),1)
    del _X_test
    del _y_test

## Preprocessing for TFIDF

In [5]:
# TFIDF
tf_y_unique_labels = np.unique(tf_y_train)
tf_le = preprocessing.LabelEncoder()
tf_le.fit(tf_y_unique_labels)

LabelEncoder()

In [6]:
tf_y_train_encoded = tf_le.transform(tf_y_train)

  y = column_or_1d(y, warn=True)


In [7]:
tf_y_test_encoded = tf_le.transform(tf_y_test)

  y = column_or_1d(y, warn=True)


## Preprocessing for DFICF

In [8]:
# TFIDF
df_y_unique_labels = np.unique(df_y_train)
df_le = preprocessing.LabelEncoder()
df_le.fit(df_y_unique_labels)

LabelEncoder()

In [9]:
df_y_train_encoded = df_le.transform(df_y_train)

  y = column_or_1d(y, warn=True)


In [10]:
df_y_test_encoded = df_le.transform(df_y_test)

  y = column_or_1d(y, warn=True)


# K-NearestNeighbours Classifier

In [11]:
knn = KNeighborsClassifier()

## TFIDF K-NN Processing

In [12]:
knn.fit(tf_X_train, tf_y_train_encoded)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [13]:
#KNN Labels
knn.classes_

array([0, 1, 2, 3, 4])

In [14]:
#KNN Encoded Labels
tf_y_unique_labels

array(['business', 'entertainment', 'politics', 'sport', 'tech'], 
      dtype='<U13')

In [15]:
correct_predictions = 0
wrong_predictions = 0
for indx, _ in enumerate(tf_X_test):
    tf_X_sample_record = _.reshape(1, _.shape[0])
    #sample_indx = 0
    #tf_X_sample = tf_X_test[sample_indx].reshape(1, tf_X_test[sample_indx].shape[0])
    predicted = knn.predict_proba(tf_X_sample_record)
    predicted_index = np.argmax(predicted)
    if (tf_y_unique_labels[predicted_index] == tf_y_test[indx]):
        correct_predictions +=1
    else:
        wrong_predictions +=1
        #print(predicted)

### TFIDF K-NN Results:

In [16]:
print('Total Test Records : {}'.format(tf_X_test.shape[0]))
print('Total Correct Predictions : {}'.format(correct_predictions))
print('Total Wrong Predictions : {}'.format(wrong_predictions))
tf_knn_rate = (correct_predictions/tf_X_test.shape[0])* 100
print('Success Rate : {}%'.format(tf_knn_rate))

Total Test Records : 125
Total Correct Predictions : 117
Total Wrong Predictions : 8
Success Rate : 93.60000000000001%


## DFICF K-NN Processing

In [17]:
knn.fit(df_X_train, df_y_train_encoded)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
#KNN Labels
knn.classes_

array([0, 1, 2, 3, 4])

In [19]:
#KNN Encoded Labels
df_y_unique_labels

array(['business', 'entertainment', 'politics', 'sport', 'tech'], 
      dtype='<U13')

In [20]:
correct_predictions = 0
wrong_predictions = 0
window_size = df_y_unique_labels.shape[0]
# iteration should be achieved in every jump_step/block like window_size, 
# because, df_X_test is ordered by the document index, or in other terms same document
# same document is getting repearted in each window
# thus, in each window, we need to iterate again for each document in this window.
for step in range(0, df_X_test.shape[0], window_size):
    max_predicted = None
    max_predicted_value = 0
    print(10 * '*')
    for indx in range(step, step + window_size):
        print('step:', step)
        print('index:', indx)
        print('record:', df_X_test[indx])
        print('label:', df_potential_category_test[indx])
        df_X_sample_record = df_X_test[indx].reshape(1, df_X_test[indx].shape[0])
        #sample_indx = 0
        #tf_X_sample = tf_X_test[sample_indx].reshape(1, tf_X_test[sample_indx].shape[0])
        predicted = knn.predict_proba(df_X_sample_record)
        predicted_value = np.max(predicted)
        print('prediction:', predicted)
        if predicted_value > max_predicted_value:
            max_predicted_value = predicted_value
            max_predicted = predicted
    
    print('Max predicted:', max_predicted)
    print('Max predicted value:', max_predicted_value)
    max_predicted_index = np.argmax(max_predicted)
    print('Max predicted index:', max_predicted_index)
    
    print('Predicted:', df_y_unique_labels[max_predicted_index])
    print('Actual:', df_y_test[step])
    if (df_y_unique_labels[max_predicted_index] == df_y_test[step]):
        correct_predictions +=1
    else:
        wrong_predictions +=1
        #print(predicted)

**********
step: 0
index: 0
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 0
index: 1
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 0
index: 2
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 0
index: 3
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 0
index: 4
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 1.  0.  0.  0.  0.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['business']
**********
step: 5
index: 5
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.8  0.   0.   0.   0.2]]
step: 5
index: 6
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 5
index: 7
record: [ 0.  0.  0. ...,  0.  0.  0.]

prediction: [[ 0.8  0.   0.2  0.   0. ]]
step: 65
index: 68
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 0.2  0.   0.8  0.   0. ]]
step: 65
index: 69
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 1.  0.  0.  0.  0.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['politics']
**********
step: 70
index: 70
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 70
index: 71
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 70
index: 72
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 70
index: 73
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 70
index: 74
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
Max predicted:

prediction: [[ 0.  0.  1.  0.  0.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['politics']
**********
step: 135
index: 135
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 135
index: 136
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 135
index: 137
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 135
index: 138
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 135
index: 139
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['business']
**********
step: 140
index: 140
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [

prediction: [[ 1.  0.  0.  0.  0.]]
step: 195
index: 197
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 195
index: 198
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 195
index: 199
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.  0.  0.  0.  1.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['tech']
**********
step: 200
index: 200
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 200
index: 201
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 200
index: 202
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 200
index: 203
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 200
i

prediction: [[ 1.  0.  0.  0.  0.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['sport']
**********
step: 265
index: 265
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 265
index: 266
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 265
index: 267
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 0.6  0.   0.   0.4  0. ]]
step: 265
index: 268
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 265
index: 269
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['sport']
**********
step: 270
index: 270
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0

prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 330
index: 333
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.  1.  0.  0.  0.]]
step: 330
index: 334
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.4  0.6  0.   0.   0. ]]
Max predicted: [[ 0.  1.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 1
Predicted: entertainment
Actual: ['entertainment']
**********
step: 335
index: 335
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 335
index: 336
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.6  0.   0.   0.   0.4]]
step: 335
index: 337
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 335
index: 338
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 335
index: 339
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 1.  0.  0.

prediction: [[ 0.8  0.   0.2  0.   0. ]]
Max predicted: [[ 0.  0.  1.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 2
Predicted: politics
Actual: ['politics']
**********
step: 400
index: 400
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 400
index: 401
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 400
index: 402
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 400
index: 403
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.6  0.4  0.   0.   0. ]]
step: 400
index: 404
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.4  0.6  0.   0.   0. ]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['entertainment']
**********
step: 405
index: 405
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: 

prediction: [[ 1.  0.  0.  0.  0.]]
step: 465
index: 469
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['business']
**********
step: 470
index: 470
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 0.4  0.   0.   0.6  0. ]]
step: 470
index: 471
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 470
index: 472
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 470
index: 473
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 470
index: 474
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
Max predicted: [[ 0.8  0.2  0.   0.   0. ]]
Max predicted value: 0.8
Max predicted index: 0
Predicted: bus

prediction: [[ 0.8  0.2  0.   0.   0. ]]
step: 530
index: 533
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.6  0.   0.   0.   0.4]]
step: 530
index: 534
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['tech']
**********
step: 535
index: 535
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 0.6  0.4  0.   0.   0. ]]
step: 535
index: 536
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 535
index: 537
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 535
index: 538
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 535
index: 539
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 0.8  0.2  0.   0.   0. ]]
M

prediction: [[ 1.  0.  0.  0.  0.]]
step: 600
index: 601
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['sport']
prediction: [[ 0.8  0.   0.   0.2  0. ]]
step: 600
index: 602
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 600
index: 603
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['tech']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 600
index: 604
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 1.  0.  0.  0.  0.]]
Max predicted: [[ 1.  0.  0.  0.  0.]]
Max predicted value: 1.0
Max predicted index: 0
Predicted: business
Actual: ['sport']
**********
step: 605
index: 605
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['entertainment']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 605
index: 606
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['business']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 605
index: 607
record: [ 0.  0.  0. ...,  0.  0.  0.]
label: ['politics']
prediction: [[ 1.  0.  0.  0.  0.]]
step: 60

### DFICF K-NN Results: 

In [21]:
print('Total Test Records : {}'.format(df_X_test.shape[0]/window_size))
print('Total Correct Predictions : {}'.format(correct_predictions))
print('Total Wrong Predictions : {}'.format(wrong_predictions))
df_knn_rate = (correct_predictions/(df_X_test.shape[0]/window_size))* 100
print('Success Rate : {}%'.format(df_knn_rate))

Total Test Records : 125.0
Total Correct Predictions : 51
Total Wrong Predictions : 74
Success Rate : 40.8%
