In [198]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


In [332]:
class Automated_SAR_Grading:
    """Grade short-answer responses using a single reference (teacher) answer.

    Pipeline (see ``run``):
      1. Cluster all rows into two groups with KMeans and relabel the
         clusters so that the reference answer's cluster is ``1``.
      2. Use distance to the reference answer to pick pseudo-labelled
         training rows: the 3 nearest rows of cluster 1 (plus the reference
         answer itself) are labelled 1, the 3 farthest rows of cluster 0
         are labelled 0.
      3. Fit a logistic regression on those rows and predict a label for
         every non-reference row.
      4. Compare the predictions with ``doc['label']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, one row per response, including the reference
        answer. All columns are treated as features.
    doc : pandas.DataFrame
        Table that is row-aligned (by position) with ``df``; its ``'label'``
        column is read as the ground truth in ``classify``.
    answer_row : int, default -1
        Position (``iloc``-style, may be negative) of the reference answer
        in ``df``.
    random_state : int or None, default None
        Forwarded to ``KMeans``. ``None`` keeps the previous unseeded
        behaviour; pass an int for reproducible clustering.
    """

    def __init__(self, df, doc, answer_row = -1, random_state = None):
        self.df = df.copy()
        self.answer_row = answer_row
        # Remember the original feature columns: 'clusters' and 'distances'
        # are added to self.df later and must never be used as features.
        self._feature_cols = list(df.columns)
        answer_pos = self._answer_position()
        # Rows to be graded/evaluated: everything except the reference answer.
        self.doc = self._drop_position(doc, answer_pos)
        self.clf_df = self._drop_position(df, answer_pos)
        self.model = KMeans(n_clusters = 2, random_state = random_state)
        self.clf = LogisticRegression()
        self.closest = []
        self.furthest = []
        self.X = pd.DataFrame()
        self.X_train = pd.DataFrame()
        self.y_train = np.array([])
        self.X_test = pd.DataFrame()
        self.y_pred = []
        self.y_true = []
        self.accuracy = 0
        self.cm = []

    @staticmethod
    def _drop_position(frame, position):
        """Return a copy of ``frame`` without the row at integer ``position``."""
        keep = np.arange(len(frame)) != position
        return frame.iloc[keep].copy()

    def _answer_position(self):
        """Return ``answer_row`` as a non-negative position into ``self.df``."""
        position = int(self.answer_row)
        if position < 0:
            position += len(self.df)
        return position

    def cluster(self):
        """Fit KMeans on the feature columns and store labels in ``df['clusters']``.

        Only the original feature columns are used, so calling this again
        (or after ``find_train_set_idxs``) does not cluster on the derived
        'clusters' / 'distances' columns.
        """
        self.model = self.model.fit(self.df[self._feature_cols])
        self.df['clusters'] = self.model.labels_
        self.correct_cluster_labels(self.answer_row)

    def find_train_set_idxs(self):
        """Select index labels of the pseudo-labelled training rows.

        Writes the Euclidean distance of every row to the reference answer
        into ``df['distances']`` (the answer row itself is set to -1 so the
        ``> 0`` filter skips it), then stores:

        * ``self.closest``  - the 3 nearest cluster-1 rows plus the answer row
        * ``self.furthest`` - the 3 farthest cluster-0 rows

        Requires ``cluster()`` to have been called first.
        """
        answer_pos = self._answer_position()
        features = self.df[self._feature_cols]
        answer = features.iloc[answer_pos]
        # Distance over feature columns only; skipna=False keeps NaN
        # propagation identical to plain element-wise addition.
        distances = ((features - answer) ** 2).sum(axis=1, skipna=False) ** 0.5
        # Positional assignment: a label-based .loc[-1] would append a new
        # row when answer_row is negative.
        distances.iloc[answer_pos] = -1
        self.df['distances'] = distances

        candidates = self.df[self.df['distances'] > 0]
        same_cluster = candidates[candidates['clusters'] == 1]
        other_cluster = candidates[candidates['clusters'] == 0]
        self.closest = same_cluster.nsmallest(3, 'distances').index.values.tolist()
        # Store the answer's index *label* so the isin() lookups below match it.
        self.closest.append(self.df.index[answer_pos])
        self.furthest = other_cluster.nlargest(3, 'distances').index.values.tolist()

    def correct_cluster_labels(self, answer_row):
        """Flip the 0/1 cluster labels if the row at ``answer_row`` is not in cluster 1.

        ``answer_row`` is a position (``iloc``-style) into ``self.df``.
        """
        cluster_label = self.df['clusters'].iloc[answer_row]
        if 1 != cluster_label:
            self.df['clusters'] = 1 - self.df['clusters']

    def create_train_test_sets(self):
        """Build the pseudo-labelled training set and the remaining test rows.

        Training rows are taken from the full feature frame so that the
        reference answer (appended to ``self.closest``) is actually part of
        the positive examples. ``X_test`` holds the non-reference rows that
        were not used for training; its length is printed.
        """
        features = self.df[self._feature_cols]
        X_correct = features[features.index.isin(self.closest)].copy()
        X_correct['label'] = 1
        X_incorrect = features[features.index.isin(self.furthest)].copy()
        X_incorrect['label'] = 0
        self.X = pd.concat([X_correct, X_incorrect])
        self.X_train = self.X.drop(columns='label')
        self.y_train = self.X['label']

        self.X_test = self.clf_df[~self.clf_df.index.isin(self.closest + self.furthest)].copy()
        print(len(self.X_test))

    def classify(self):
        """Fit the classifier and predict every non-reference row.

        Returns
        -------
        (y_pred, y_true) : predictions for ``self.clf_df`` and the matching
        values of ``self.doc['label']``. Note that the prediction set also
        contains the rows that were used for training.
        """
        self.clf = self.clf.fit(self.X_train, self.y_train)
        self.y_pred = self.clf.predict(self.clf_df)
        self.y_true = self.doc['label'].values
        return self.y_pred, self.y_true

    def score(self):
        """Return (and cache in ``self.accuracy``) the accuracy of the last ``classify``."""
        self.accuracy = accuracy_score(self.y_true, self.y_pred)
        return self.accuracy

    def confusion_mtx(self):
        """Return (and cache in ``self.cm``) the confusion matrix of the last ``classify``."""
        self.cm = confusion_matrix(self.y_true, self.y_pred)
        return self.cm

    def run(self):
        """Run the whole pipeline and return the accuracy.

        1. Clustering
        2. Grab the closest 3 and furthest 3 row indexes (and label them)
        3. Define train and test sets
        4. Logistic Regression
        5. Score

        No validation of the input data is performed here.
        """
        self.cluster()
        self.find_train_set_idxs()
        self.create_train_test_sets()
        self.classify()
        return self.score()

In [333]:
# Feature rows handed to the grader; the last row is passed as the answer row below.
data = pd.read_csv('../data/feature_sets/question100data.csv')
# Companion table for the same question; its 'label' column is read as y_true.
doc = pd.read_csv('../data/feature_sets/question100doc.csv')

In [334]:
# The final row of `data` is used as the answer row.
sar = Automated_SAR_Grading(data, doc, answer_row=len(data) - 1)

In [335]:
# Fit the 2-cluster KMeans model and relabel so the answer row's cluster is 1.
sar.cluster()




In [336]:
# Pick training indexes: 3 nearest cluster-1 rows and 3 farthest cluster-0 rows
# (by distance to the answer row).
sar.find_train_set_idxs()

In [337]:
# Build X_train / y_train from the selected rows; prints how many rows are left in X_test.
sar.create_train_test_sets()

30


In [338]:
# Fit logistic regression on the training rows; returns (y_pred, y_true).
sar.classify()

(array([0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 array([0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))

In [339]:
# Reference labels taken from doc['label'] by classify().
sar.y_true

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [340]:
# Labels predicted by the logistic regression in classify().
sar.y_pred

array([0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)