In [None]:
import os
import warnings  
import statistics
import numpy as np
import pandas as pd 
from math import sqrt  
from sklearn import metrics
from sklearn.svm import SVC
from statistics import mode 
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Root folder for the CSV data; result files are written under "<path>/results".
path = "./csvdata"
# Folder holding the per-interval input files (read as "<agg_path>/<i>.csv" by generate_model*).
agg_path = "./agg"
# Results file that plot_series() reads back.
results_path = "./csvdata/results/results_svmgs588_new.csv"
# Exclusive upper bound of the file index i used by the range(...) loops below.
time_interval = 1288

# Hyper-parameter grid handed to GridSearchCV together with SVC (train_model2 / train_model3).
# NOTE(review): a single dict is expanded to the full cross product
# (7 * 4 * 3 * 5 = 420 candidates per fit); `degree` presumably only matters for the
# 'poly' kernel, so a list of per-kernel grids would avoid redundant fits - confirm.
svm_parameters = {
    'C': (1000000000, 100000, 10000, 1000, 100, 10, 1),
    'gamma': (1, .01, .0001, 'auto'),
    'kernel': ('linear', 'rbf', 'poly'),
    'degree': (2, 3, 4, 5, 6)
}

# Hyper-parameter grid whose keys match RandomForestClassifier options.
# NOTE(review): 'auto' for max_features is believed to have been removed in
# scikit-learn 1.3 - confirm against the installed version.
rfc_parameters = { 
    'n_estimators': [5, 10, 20],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}


In [None]:
def get_features(n):
    """Pick the ``n`` features that most often rank among the top ANOVA F-scores.

    For every file ``<agg_path>/<i>.csv`` with ``400 <= i < time_interval`` the
    first 305 columns are taken as features (passed through ``normalize``), the
    last column as the label, and the classes are balanced with
    ``get_downsampled_data``.  Each feature that is among the ``n`` highest
    ``f_classif`` scores for a file receives one vote.

    Parameters
    ----------
    n : int
        Number of features to select, both per file and in the final result.

    Returns
    -------
    list of int
        Column positions of the (at most) ``n`` most frequently selected
        features, ordered from fewest to most votes.
    """
    votes = defaultdict(int)
    for i in range(400, time_interval):
        # Use the same "<agg_path>/<i>.csv" layout as generate_model*; the
        # separator was previously missing here.
        csv_path = agg_path + "/" + str(i) + ".csv"
        df = pd.read_csv(csv_path)
        X = normalize(df.iloc[:, 0:305])
        y = df.iloc[:, -1]
        X, y = get_downsampled_data(X, y)
        selector = SelectKBest(score_func=f_classif, k=n).fit(X, y)
        # nlargest honours `n` (instead of a hard-coded 269:305 slice) and
        # skips NaN scores, which an ascending sort would have ranked as "best".
        for feature in pd.Series(selector.scores_).nlargest(n).index.tolist():
            votes[feature] += 1

    ranked = sorted(votes.items(), key=lambda item: item[1])
    # Keep the n most-voted features; safe when fewer than n were ever seen.
    top = ranked[max(len(ranked) - n, 0):]
    return [feature for feature, _ in top]
    

In [None]:
def get_features2(n):
    """Pick the ``n`` features that most often rank top by random-forest importance.

    For every file ``<agg_path>/<i>.csv`` with ``400 <= i < time_interval`` the
    first 305 columns are taken as features (passed through ``normalize``), the
    last column as the label, and the classes are balanced with
    ``get_downsampled_data``.  A default ``RandomForestClassifier`` is fitted
    and each feature among the ``n`` largest ``feature_importances_`` receives
    one vote.

    Parameters
    ----------
    n : int
        Number of features to select, both per file and in the final result.

    Returns
    -------
    list of int
        Column positions of the (at most) ``n`` most frequently selected
        features, ordered from fewest to most votes.
    """
    votes = defaultdict(int)
    for i in range(400, time_interval):
        # Use the same "<agg_path>/<i>.csv" layout as generate_model*; the
        # separator was previously missing here.
        csv_path = agg_path + "/" + str(i) + ".csv"
        df = pd.read_csv(csv_path)
        X = normalize(df.iloc[:, 0:305])
        y = df.iloc[:, -1]
        X, y = get_downsampled_data(X, y)
        rfc = RandomForestClassifier()
        rfc.fit(X, y)
        # Honour `n` instead of the previous hard-coded 269:305 slice.
        importances = pd.Series(rfc.feature_importances_)
        for feature in importances.nlargest(n).index.tolist():
            votes[feature] += 1

    ranked = sorted(votes.items(), key=lambda item: item[1])
    # Keep the n most-voted features; safe when fewer than n were ever seen.
    top = ranked[max(len(ranked) - n, 0):]
    return [feature for feature, _ in top]
    

In [None]:
def get_features3(X, n):
    """Reduce ``X`` to ``n`` PCA components.

    Returns a DataFrame with one column per component, named ``PC1`` .. ``PCn``.
    """
    labels = ["PC{}".format(idx) for idx in range(1, n + 1)]
    projected = PCA(n_components=n).fit_transform(X)
    return pd.DataFrame(projected, columns=labels)

In [None]:
def get_features4(X, n):
    """Reduce ``X`` to ``n`` FastICA components.

    Returns a DataFrame with one column per component, named ``IC1`` .. ``ICn``.
    """
    labels = ["IC{}".format(idx) for idx in range(1, n + 1)]
    unmixed = FastICA(n_components=n).fit_transform(X)
    return pd.DataFrame(unmixed, columns=labels)

In [None]:
def get_features5(X, y, n):
    """Project ``X`` onto ``n`` linear discriminants fitted against ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels used to fit the discriminant analysis.
    n : int
        Number of discriminant components to keep.  scikit-learn limits this to
        ``min(n_features, n_classes - 1)`` and raises ``ValueError`` beyond it,
        so for a two-class problem only ``n == 1`` is valid.

    Returns
    -------
    pandas.DataFrame
        One column per component, named ``LD1`` .. ``LDn``.
    """
    # Previously fitted on the undefined name `Y` (NameError) and always used a
    # single component regardless of `n`, which broke the column labels.
    lda = LinearDiscriminantAnalysis(n_components=n)
    X_lda = lda.fit(X, y).transform(X)
    column_names = ["LD" + str(i) for i in range(1, n + 1)]
    return pd.DataFrame(data=X_lda, columns=column_names)

In [None]:
def get_class_distribution(X, y):
    """Count the rows labelled 0 and all remaining rows.

    ``X`` is wrapped in a DataFrame and ``y`` attached as column ``"y"``.
    Returns ``(number of rows with y == 0, number of all other rows)``.
    """
    frame = pd.DataFrame(X)
    frame["y"] = y
    total_rows = frame.shape[0]
    zero_rows = int((frame["y"] == 0).sum())
    return zero_rows, total_rows - zero_rows

In [None]:
def get_downsampled_data(X, y, random_state=None):
    """Balance a binary data set by randomly down-sampling the larger class.

    Rows labelled 0 and rows labelled 1 are separated; the larger group is
    sampled without replacement down to the size of the smaller one (when both
    are equal, the label-1 rows are merely re-drawn at full size).

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels, expected to contain only 0 and 1; rows with any other label
        are dropped.
    random_state : int or numpy RandomState, optional
        Forwarded to ``DataFrame.sample`` to make the draw reproducible.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Balanced feature matrix and labels, label-0 rows first.
    """
    df = pd.DataFrame(X)
    # Assign positionally: a Series `y` with a non-default index would otherwise
    # be aligned by label and silently turn into NaN.
    df["y"] = np.asarray(y)
    df_correct = df[df["y"] == 0]
    df_incorrect = df[df["y"] == 1]

    # Sizes come from the filtered frames, so the sample size can never exceed
    # the number of rows actually available in a class.
    correct_num, incorrect_num = len(df_correct), len(df_incorrect)
    if correct_num > incorrect_num:
        df_correct = df_correct.sample(n=incorrect_num, random_state=random_state)
    else:
        df_incorrect = df_incorrect.sample(n=correct_num, random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    balanced = pd.concat([df_correct, df_incorrect])
    y = balanced["y"].values
    X = balanced.drop("y", axis=1).values
    return X, y

In [None]:
def generate_model(feature_list=None):
    """Train and score a model for every interval file and save the scores.

    Reads ``<agg_path>/<i>.csv`` for ``0 <= i < time_interval``, splits off the
    ``"y"`` column as the label, optionally restricts the features, normalises
    them and scores them with ``train_model``.  The list of scores is written
    to ``<path>/results/results_svmgs1088.csv``.

    Parameters
    ----------
    feature_list : sequence of int, optional
        Column positions to keep (e.g. the result of ``get_features(n)``).
        When omitted, all feature columns are used.  The function previously
        read an undefined global of this name and failed with ``NameError``.

    Returns
    -------
    list
        One accuracy value per interval file.
    """
    scores = []
    destination_path = path + "/" + "results/results_svmgs1088.csv"
    for i in range(time_interval):
        file_path = agg_path + "/" + str(i) + ".csv"
        df = pd.read_csv(file_path)
        y = df["y"].values
        X = df.drop("y", axis=1).values
        if feature_list is not None:
            X = X[:, feature_list]
        normalized_X = normalize(X)
        scores.append(train_model(normalized_X, y))

    pd.Series(scores).to_csv(destination_path)
    return scores

In [None]:
def generate_model2(iterations, sample_size):
    """Score every interval file on repeated balanced bootstrap samples.

    For each ``<agg_path>/<i>.csv`` (``0 <= i < time_interval``) the function
    draws ``sample_size`` rows with replacement from each class ``iterations``
    times, scores every draw with ``train_model2`` and averages the results.
    The averages are printed, written to
    ``<path>/results/results_svmgs1088.csv`` and returned.

    Parameters
    ----------
    iterations : int
        Number of bootstrap draws per file; must be at least 1.
    sample_size : int
        Rows drawn (with replacement) from each class per draw.

    Returns
    -------
    list
        Mean accuracy per interval file.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    mean_scores = []
    destination_path = path + "/" + "results/results_svmgs1088.csv"
    for i in range(time_interval):
        file_path = agg_path + "/" + str(i) + ".csv"
        df = pd.read_csv(file_path)
        # Class filtering does not depend on the draw, so do it once per file.
        correct_rows = df[df.y == 0]
        incorrect_rows = df[df.y == 1]

        score_sum = 0
        for _ in range(iterations):
            cor = correct_rows.sample(n=sample_size, replace=True)
            incor = incorrect_rows.sample(n=sample_size, replace=True)
            # DataFrame.append was removed in pandas 2.0; concat is the replacement.
            # NOTE(review): rows stay grouped by class here (generate_model3
            # shuffles them) and train_model2 uses an unshuffled KFold, so test
            # folds may be dominated by one class - confirm this is intended.
            df2 = pd.concat([cor, incor])
            y = df2["y"].values
            X = df2.drop("y", axis=1).values
            normalized_X = normalize(X)
            score_sum += train_model2(normalized_X, y)
        average_accuracy_score = score_sum / iterations
        print(average_accuracy_score)
        mean_scores.append(average_accuracy_score)

    pd.Series(mean_scores).to_csv(destination_path)
    return mean_scores

In [None]:
def generate_model3(sample_size, num_of_models):
    """Score every interval file with ``train_model3`` on a balanced sample.

    For each ``<agg_path>/<i>.csv`` (``0 <= i < time_interval``) the function
    draws ``sample_size`` rows without replacement from each class, shuffles
    them, normalises the features and scores them with ``train_model3``.  The
    scores are written to ``<path>/results/results_svmgs588_new.csv``.

    Parameters
    ----------
    sample_size : int
        Rows drawn from each class; ``DataFrame.sample`` raises ``ValueError``
        if a class has fewer rows than this.
    num_of_models : int
        Passed through to ``train_model3``.

    Returns
    -------
    list
        One accuracy value per interval file.
    """
    scores = []
    destination_path = path + "/" + "results/results_svmgs588_new.csv"
    for i in range(time_interval):
        file_path = agg_path + "/" + str(i) + ".csv"
        df = pd.read_csv(file_path)
        cor = df[df.y == 0].sample(n=sample_size)
        incor = df[df.y == 1].sample(n=sample_size)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        # sample(frac=1) shuffles all rows so the classes are interleaved.
        df2 = pd.concat([cor, incor]).sample(frac=1).reset_index(drop=True)
        # Hand over plain arrays so fold indexing downstream is positional.
        y = df2["y"].values
        X = df2.drop("y", axis=1)
        normalized_X = normalize(X)
        scores.append(train_model3(normalized_X, y, num_of_models))

    pd.Series(scores).to_csv(destination_path)
    return scores

In [None]:
def train_model(X, y):
    """Return the mean 3-fold accuracy of a grid-searched random forest.

    For each fold the training part is balanced with ``get_downsampled_data``,
    a ``RandomForestClassifier`` is tuned with ``GridSearchCV`` over
    ``rfc_parameters`` and evaluated on the untouched test part.  The mean
    accuracy is printed and returned.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed positionally by the fold indices.
    y : numpy.ndarray
        Labels aligned with ``X``.
    """
    # Pin the fold count: the KFold default changed from 3 to 5 in
    # scikit-learn 0.22, which made the former hard-coded "/ 3" wrong.
    kf = KFold(n_splits=3)
    fold_scores = []
    for train_index, test_index in kf.split(X):
        X_int, X_test = X[train_index], X[test_index]
        y_int, y_test = y[train_index], y[test_index]
        X_train, y_train = get_downsampled_data(X_int, y_int)
        # The grid was previously looked up under the undefined name
        # `model_parameters`; the random-forest grid is `rfc_parameters`.
        clf = GridSearchCV(RandomForestClassifier(), rfc_parameters)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        fold_scores.append(metrics.accuracy_score(y_test, predictions))
    mean_accuracy = sum(fold_scores) / len(fold_scores)
    print(mean_accuracy)

    return mean_accuracy

In [None]:
def train_model2(X, y):
    """Return the mean 3-fold accuracy of a grid-searched SVC.

    For each fold the training part is balanced with ``get_downsampled_data``,
    an ``SVC`` is tuned with ``GridSearchCV`` over ``svm_parameters`` and
    evaluated on the untouched test part.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed positionally by the fold indices.
    y : numpy.ndarray
        Labels aligned with ``X``.
    """
    # Pin the fold count: the KFold default changed from 3 to 5 in
    # scikit-learn 0.22, which made the former hard-coded "/ 3" wrong.
    kf = KFold(n_splits=3)
    fold_scores = []
    for train_index, test_index in kf.split(X):
        X_int, X_test = X[train_index], X[test_index]
        y_int, y_test = y[train_index], y[test_index]
        X_train, y_train = get_downsampled_data(X_int, y_int)
        clf = GridSearchCV(SVC(), svm_parameters)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        fold_scores.append(metrics.accuracy_score(y_test, predictions))

    return sum(fold_scores) / len(fold_scores)

In [None]:
def train_model3(X, y, num_of_models):
    """Return the mean 3-fold accuracy of a majority-vote ensemble of SVCs.

    Within each fold, ``num_of_models`` grid-searched ``SVC`` models are fitted
    on bootstrap resamples (same size, with replacement) of the training part.
    Every test row is assigned the most common prediction across the models and
    the fold accuracy is computed from those votes.  The mean accuracy is
    printed and returned.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed positionally by the fold indices.
    y : array-like
        Labels aligned with ``X``.
    num_of_models : int
        Ensemble size; must be at least 1.  An odd value avoids tied votes on
        two-class data (``statistics.mode`` raises on ties before Python 3.8).

    Raises
    ------
    ValueError
        If ``num_of_models`` is smaller than 1.
    """
    if num_of_models < 1:
        raise ValueError("num_of_models must be at least 1")

    # Pin the fold count: the KFold default changed from 3 to 5 in
    # scikit-learn 0.22, which made the former hard-coded "/ 3" wrong.
    kf = KFold(n_splits=3)
    fold_scores = []
    for train_index, test_index in kf.split(X):
        X_int, X_test = X[train_index], X[test_index]
        y_int, y_test = y[train_index], y[test_index]
        train_df = pd.DataFrame(X_int)
        train_df["y"] = np.asarray(y_int)

        model_predictions = []
        for _ in range(num_of_models):
            resampled = train_df.sample(len(train_df), replace=True)
            clf = GridSearchCV(SVC(), svm_parameters)
            clf.fit(resampled.drop("y", axis=1), resampled["y"])
            model_predictions.append(clf.predict(X_test))

        # Majority vote per test row.  The vote used to be overwritten by the
        # first model's predictions, so the ensemble had no effect on the score.
        final_predictions = [mode(votes) for votes in zip(*model_predictions)]
        fold_scores.append(metrics.accuracy_score(y_test, final_predictions))

    mean_accuracy = sum(fold_scores) / len(fold_scores)
    print(mean_accuracy)
    return mean_accuracy
    

In [None]:
def plot_series():
    """Plot the saved accuracy series with a cubic trend line and save it.

    Reads ``results_path`` (first column: interval index, second column:
    accuracy), fits a degree-3 polynomial, draws the points and the fitted
    curve, and writes the figure to
    ``./csvdata/results/timeseriesplot_svmgs588_new.png``.
    """
    # pd.Series.from_csv was removed in pandas 1.0; read_csv with header=None
    # and index_col=0 reproduces its defaults.
    frame = pd.read_csv(results_path, header=None, index_col=0)
    ser = frame.iloc[:, 0]
    # Newer pandas writes a header line from Series.to_csv; read this way it
    # shows up as a row with a missing index, so discard such rows.
    ser = ser[ser.index.notna()]

    x_values = ser.index
    y_values = ser.values
    poly_degree = 3
    coeffs = np.polyfit(x_values, y_values, poly_degree)
    poly_eqn = np.poly1d(coeffs)
    y_hat = poly_eqn(x_values)

    plt.figure(figsize=(12, 8))
    plt.plot(x_values, y_values, "ro")
    plt.plot(x_values, y_hat)
    plt.title("Decoded Time Series")
    plt.xlabel("Time Interval")
    # Axis label previously misspelled as "Predcition".
    plt.ylabel("Prediction Accuracy")
    plt.savefig("./csvdata/results/timeseriesplot_svmgs588_new.png")

In [None]:
# Suppress DeprecationWarning and FutureWarning messages for the run below.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Run generate_model3 with sample_size=88 rows per class and num_of_models=3.
generate_model3(88, 3)

In [None]:
# Plot the accuracy series stored at results_path and save the figure to disk.
plot_series()