In [161]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import datasets, linear_model, neighbors, svm, naive_bayes
from sklearn.neural_network import MLPRegressor
from sklearn.utils import column_or_1d
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib
import re
import sys


def loadGrade():
    """Read ``grade.csv`` (relative to the working directory) into a DataFrame.

    The first column of the file is used as the row index.
    """
    return pd.read_csv("grade.csv", index_col=0)
# function to read from csv file
def load_data(csv_path):
    """Read the CSV file at ``csv_path`` into a DataFrame.

    ``index_col=0`` makes the first column of the file the row index
    (row labels) instead of a regular data column.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    return frame

# function to return course from argument 2
def read_in():
    """Return the first command-line argument (``sys.argv[1]``), the course name.

    Raises IndexError when the script is started without any argument.
    """
    return sys.argv[1]

def get_rmse(model, X, y):
    """Fit ``model`` on ``(X, y)`` and return the RMSE of its bucketed predictions.

    The model is fitted in place, then asked to predict the very same ``X``.
    Each raw prediction is snapped to a class code before scoring:
    below 0.5 -> 0, below 1.5 -> 1, anything else -> 2.
    The returned value is the root of the mean squared error between ``y``
    and those bucketed predictions (i.e. an error on the training data).
    """
    model.fit(X, y)
    bucketed = [
        0 if raw < 0.5 else (1 if raw < 1.5 else 2)
        for raw in model.predict(X)
    ]
    return np.sqrt(mean_squared_error(y, bucketed))
	
# Load input

# inpgrade = table of the input grades (read from grade.csv by loadGrade)
inpgrade = loadGrade()
# names = original header of every column, captured before any renaming
names = list(inpgrade.columns.values)
# course = first command-line argument (not used further in this section)
course = read_in()

# Rename columns: strip a parenthesised suffix such as "Homework 1 (123)"
# and surrounding whitespace. All headers are renamed in a single call.
inpgrade = inpgrade.rename(
    columns={name: re.sub(r"\(.*\)", "", name).strip() for name in names}
)

# Drop the first two rows (the NaN row and the row of points) by POSITION.
# The previous drop(inpgrade.index[0]) removed rows by label, so any later
# row sharing that label would have been discarded as well.
inpgrade = inpgrade.iloc[2:].copy()

# Drop the ID and Section columns when they are present.
inpgrade = inpgrade.drop(columns=['ID', 'Section'], errors='ignore')

# Drop every column that contains only NaN.
inpgrade = inpgrade.dropna(axis=1, how='all')
# Replace the remaining NaN values with 0.
inpgrade = inpgrade.fillna(0)
# Expose the row index as a regular 'Full name' column.
inpgrade['Full name'] = inpgrade.index.values
# chosen = names of all float64 columns, treated as the numeric features
chosen = inpgrade.select_dtypes(include='float64').columns.values

# Load data

# grade = data used for training
grade = load_data("../../ml_scripts/data/csce156/full-grade.csv")
# intersect = numeric input columns that also exist in the training data,
# e.g. ['Homework 1', 'Homework 2', 'Homework 3', ..]
intersect = [val for val in chosen if val in grade.columns.values]
print("intersect = ",intersect)
# X = training features restricted to the shared columns
X = grade[intersect]
print('X = ', X.head())
# y = textual risk label ("Good" / "OK" / "High-risk") of every training row
y = grade[["Grade"]].values.ravel()

# y1 = numeric encoding of y. Every row must map to a code: silently skipping
# an unrecognised label would leave y1 shorter than X and only fail later,
# inside model.fit, with an unrelated-looking error.
_label_codes = {"Good": 0, "OK": 1, "High-risk": 2}
_unknown_labels = sorted({str(label) for label in y if label not in _label_codes})
if _unknown_labels:
    raise ValueError(
        "Unrecognised values in the Grade column: " + ", ".join(_unknown_labels)
    )
y1 = [_label_codes[label] for label in y]
print('data = ', y1)

# Candidate models, in a fixed order: later code addresses the first two
# entries (logistic regression, naive Bayes) by position.
models = [
    sklearn.linear_model.LogisticRegression(solver='newton-cg',multi_class='multinomial'),  # Logistic Regression
    sklearn.naive_bayes.GaussianNB(),  # Naive Bayes
    sklearn.neighbors.KNeighborsRegressor(n_neighbors=10),  # k Nearest Neighbors
    svm.SVR(),  # Support Vector Machine
    #MLPRegressor(hidden_layer_sizes=(60,),activation='logistic',solver='lbfgs',learning_rate='adaptive',max_iter=1000,learning_rate_init=0.01,alpha=0.01), # Neuron network
    DecisionTreeRegressor(),  # Decision Tree
    RandomForestRegressor(),  # Random Forest
]

# chosenModels[0] holds the best single model found so far;
# minRMSE is its training RMSE as reported by get_rmse.
chosenModels = [None]
minRMSE = float("inf")

# Fit every candidate (get_rmse fits in place) and keep the first one that
# reaches the strictly lowest RMSE.
for model in models:
    rmse = get_rmse(model, X, y1)
    if rmse < minRMSE:
        minRMSE, chosenModels[0] = rmse, model
#
def _risk_level(score):
    """Snap a raw prediction to a class code: <0.5 -> 0, <1.5 -> 1, else 2."""
    if score < 0.5:
        return 0
    return 1 if score < 1.5 else 2


# Bucketed predictions of the first two candidates (logistic regression and
# naive Bayes) on the training features.
lr_pred = [_risk_level(score) for score in models[0].predict(X)]
nb_pred = [_risk_level(score) for score in models[1].predict(X)]

# Combine both models by taking, per row, the higher (riskier) class code.
lrnb_pred = [max(lr_code, nb_code) for lr_code, nb_code in zip(lr_pred, nb_pred)]

mse = mean_squared_error(y1, lrnb_pred)
rmse = np.sqrt(mse)
# When the combination is at least as good as the best single model,
# export the pair instead: model 0 = logistic regression, model 1 = naive Bayes.
if rmse <= minRMSE:
    chosenModels[0] = models[0]
    chosenModels.append(models[1])

# Export models to the file: one pickle per chosen model (model0.pkl, model1.pkl, ...)
for i, chosen_model in enumerate(chosenModels):
    joblib.dump(chosen_model, '../../ml_scripts/models/csce156/model' + str(i) + '.pkl')

print("chosenModels[0] = " , chosenModels[0])

# Result string with three ';'-separated parts:
#   <number of chosen models>;<comma-separated feature names>;<input rows as JSON records>
# Building it with join keeps both separators even when the feature list is
# empty; trimming a trailing comma with res[:-1] removed the ';' in that case.
res = ";".join([
    str(len(chosenModels)),
    ",".join(intersect),
    inpgrade.to_json(orient="records"),
])
# NOTE(review): res is not emitted anywhere in this section; the print that
# would output it is left disabled, as in the original:
# print(res, end='', flush=True)


intersect =  ['Lab 1', 'Lab 2', 'Lab 3', 'Lab 4', 'Lab 5', 'Lab 6', 'Lab 7', 'Lab 8', 'Lab 9', 'Lab 10', 'Lab 11', 'Lab 12', 'Lab 13', 'Lab 14', 'Lab 15', 'Assignment 1', 'Assignment 2', 'Assignment 3', 'Assignment 4', 'Assignment 5', 'Assignment 6', 'Design Document', 'Late Pass', 'Midterm', 'Final', 'Weighted Total']
X =               Lab 1  Lab 2  Lab 3  Lab 4  Lab 5  Lab 6  Lab 7  Lab 8  Lab 9  \
days online                                                                  
0              100    0.0  100.0  100.0    100  100.0      0      0      0   
1                0  100.0  100.0  100.0     99  100.0      0    100    100   
7              100  100.0   99.5  100.0    100  100.0    100    100    100   
0              100  100.0  100.0  100.0    100  100.0    100    100    100   
19             100  100.0  100.0  100.0    100  100.0    100    100    100   

             Lab 10       ...        Assignment 2  Assignment 3  Assignment 4  \
days online               ...                 