# HW-5: Malware Classification (Due 5th January, 2023)

**Instructions:**

Suppose your company has been struggling with a series of computer virus attacks for the past several months. The viruses were grouped into a few types with some effort. However, it takes a long time to determine which type of virus it is when an attack occurs. Thus, as a senior IT department member, you have undertaken a project to classify the virus as quickly as possible. You've been given a dataset of features that may be handy (or not), and also the associated virus type (target variable).

You are supposed to try different classification methods and apply best practices we have seen in the lectures such as grid search, cross validation, regularization etc. To increase your grade you can add more elaboration such as using ensembling or exploiting feature selection/extraction techniques. **An evaluation rubric is provided.**

Please prepare a Python notebook that describes the steps and presents the results as well as your comments.

You can download the data (csv file) [here](https://drive.google.com/file/d/1yxbibzUU8bjOyChDVFPfQ4viLduYdk29/view?usp=sharing).


In [None]:

#Kutay Özbay 270201017 HW5

#Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
import io
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier




In [None]:
#The following methods are used in this work:
#Logistic Regression
#Decision Tree
#K-Nearest Neighbours
#Random Forest
#Support Vector Machine
#KMeans (clustering)
#An ensemble (voting) of LR, K-NN and Decision Tree

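The following methods are used in this work:
1. Logistic Regression
2. Decision Tree
3. K-Nearest Neighbours
4. Random Forest
5. Support Vector Machine
6. KMeans
7. Ensembling of LR, K-NN and Decision Tree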

In [None]:
# Ask for a file upload; the returned mapping is indexed by file name when the
# CSV is read afterwards.
# NOTE(review): `files` is not imported anywhere in the visible source --
# presumably `from google.colab import files` is expected; confirm and add it.
uploaded = files.upload()

In [None]:
# Load the uploaded CSV into a DataFrame.
# The upload holds raw bytes, so decode to text before handing it to pandas.
csv_bytes = uploaded['hw5_data.csv']
csv_text = csv_bytes.decode('utf-8')
virus_df = pd.read_csv(io.StringIO(csv_text))


In [None]:
# Separate the feature matrix (every column except 'target') from the labels.
X = virus_df.drop(columns='target')
y = virus_df['target']


In [None]:
#Feature Selection using mutual information with Filter Methods
#
# The data is split BEFORE the selector is fitted. Fitting the selector on the
# full data set would let the test labels influence which features are kept
# (data leakage) and make the test scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  # fixed seed so results are reproducible
)

# Keep the 100 features with the highest mutual information with the target,
# estimated on the training portion only.
mf_select = GenericUnivariateSelect(score_func=mutual_info_classif, mode="k_best", param=100)
X_train = mf_select.fit_transform(X_train_raw, y_train)
X_test = mf_select.transform(X_test_raw)

# Reduced version of the complete feature matrix, kept under its original name.
X_mf = mf_select.transform(X)

In [None]:
# Logistic Regression: standardise the features, then tune the regularisation
# strength C with a 5-fold cross-validated grid search.
modeling_pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('model', LogisticRegression(solver='liblinear')),
])
lg_reg = modeling_pipeline

# Candidate values for C (inverse regularisation strength).
params = {'model__C': [0.1, 1, 10, 20, 50, 100, 1000]}

virus_search = GridSearchCV(
    estimator=lg_reg,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    refit=True,
)
virus_search.fit(X_train, y_train)

# Best pipeline found, its mean cross-validation accuracy, and the accuracy
# on the held-out test split.
print(virus_search.best_estimator_)
print(f'Validation score: {virus_search.best_score_:.2%}')
print('Test score:')
print(virus_search.score(X_test, y_test))

# Below is an alternative version that puts LDA in front of the classifier.
# Author's observation: LDA lowers performance but makes the algorithm faster.
# NOTE(review): the variant is disabled by wrapping it in a bare string
# literal, so it is an expression statement rather than a comment.
"""
p = Pipeline([('lda', LDA()),
              ('model', LogisticRegression(solver='liblinear'))
             ])

params = {'model__C': [0.1, 1, 10, 20, 50, 100, 1000], 'lda__n_components': [1, 2, 3, 4, 5, 6, 7, 8]}

virus_search = GridSearchCV(p, param_grid=params, scoring='accuracy', cv=5, refit=True)

virus_search.fit(X_train,y_train)

print(virus_search.best_estimator_)

print(f'Validation score: {virus_search.best_score_:.2%}')
print('Test score:' )
print(virus_search.score(X_test, y_test))

"""

In [None]:
#Decision Tree, using GridSearch to try hyperparameters systematically
std_slc = StandardScaler()
dec_tree = DecisionTreeClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('dec_tree', dec_tree)])

# Hyperparameter grid: split criterion and maximum tree depth.
criterion = ['gini', 'entropy']
max_depth = [2, 4, 6, 8, 10, 12]

parameters = dict(dec_tree__criterion=criterion,
                  dec_tree__max_depth=max_depth)

# Scoring and fold count are spelled out so this search matches the others.
tree_GS = GridSearchCV(pipe, parameters, scoring='accuracy', cv=5)
tree_GS.fit(X_train, y_train)

print('Best Criterion:', tree_GS.best_params_['dec_tree__criterion'])
print('Best max_depth:', tree_GS.best_params_['dec_tree__max_depth'])
print(); print(tree_GS.best_estimator_.named_steps['dec_tree'])

# Report accuracies rather than dumping the element-wise comparison of
# predictions and labels, which is not a readable metric.
print(f'Validation score: {tree_GS.best_score_:.2%}')
print(f'Test score: {tree_GS.score(X_test, y_test):.2%}')

In [None]:
#K-Nearest Neighbours

knn = KNeighborsClassifier()

# Candidate neighbourhood sizes k = 1..30.
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)

# 5-fold cross-validated search over k, scored by accuracy.
knn_grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', return_train_score=False, verbose=1)

# Fit on the training split. The original referenced an undefined `x_train`.
grid_search = knn_grid.fit(X_train, y_train)

print(grid_search.best_params_)

# Mean cross-validated accuracy of the best k, in percent. The original
# printed an undefined `accuracy` variable.
accuracy = grid_search.best_score_ * 100
print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy))

# Refit a plain classifier with the selected k and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors=grid_search.best_params_.get('n_neighbors'))
knn.fit(X_train, y_train)

y_test_hat = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_hat) * 100

print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy))

In [None]:
#Random Forest

# Compare three tree-based models on the same split: a single decision tree,
# bagged decision trees, and a random forest.
names = ["DecisionTree", "Bagged Trees", "RandomForest"]
classifiers = [
    DecisionTreeClassifier(),
    BaggingClassifier(
        DecisionTreeClassifier(),
        n_estimators=100,
        max_samples=0.5,
        oob_score=True,
    ),
    RandomForestClassifier(n_estimators=100, oob_score=True),
]

# Fit each model on the training split and print its test accuracy.
for m_name, model in zip(names, classifiers):
    model.fit(X_train, y_train)
    print("\t", m_name, accuracy_score(y_test, model.predict(X_test)))

In [None]:
#Support Vector Machine

# Standardise the features, then classify with an SVC.
modeling_pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('model', SVC()),
])
svm_m = modeling_pipeline

# Search over the penalty C and two kernel types.
param_grid = [
    {
        'model__C': [0.01, 0.1, 1, 10, 100, 1000],
        'model__kernel': ['linear', 'rbf'],
    }
]

# 5-fold cross-validated grid search; fit() returns the fitted search object.
svm_results = GridSearchCV(
    estimator=svm_m,
    param_grid=param_grid,
    scoring='accuracy',
    refit=True,
    cv=5,
).fit(X_train, y_train)

# Confusion matrix of the refitted best model on the test split.
ConfusionMatrixDisplay.from_estimator(svm_results, X_test, y_test)
plt.show()

svm_score = svm_results.score(X_test, y_test)
print(f'Support Vector Machine Score: {svm_score:.2%}')

In [None]:
#KMeans

#Elbow method: clustering error on the training data for k = 1..10
scores = []
for i in range(1, 11):
    k_means = KMeans(n_clusters=i)
    k_means.fit(X_train)
    # score() returns the negated clustering objective, so flip the sign.
    scores.append(-k_means.score(X_train))

#Plot elbow curve as line
plt.plot(np.arange(1, 11), scores)
plt.ylabel('Error')
plt.xlabel('Clusters (k)')
plt.show()

# Cluster the TRAINING data. The original fitted on the test split and stored
# the cluster ids in `y`, overwriting the target column.
k_means_optimum = KMeans(n_clusters=8)
train_clusters = k_means_optimum.fit_predict(X_train)

# Cluster ids are arbitrary integers, so they cannot be compared to class
# labels directly. Map each cluster to the most frequent training label in it.
train_labels = np.asarray(y_train)
default_class = pd.Series(train_labels).mode().iloc[0]
cluster_to_class = {
    cluster: pd.Series(train_labels[train_clusters == cluster]).mode().iloc[0]
    for cluster in np.unique(train_clusters)
}

# Assign each test sample to its nearest cluster and translate to a class label.
test_clusters = k_means_optimum.predict(X_test)
kmeans_pred = np.array([cluster_to_class.get(c, default_class) for c in test_clusters])

# Compare positionally via arrays: y_test keeps the shuffled original index,
# so label-based `y_test[i]` would not address the i-th test row.
kmeans_accuracy = np.mean(kmeans_pred == np.asarray(y_test))
print(kmeans_accuracy)

In [None]:
#Ensembling

# Base learners: logistic regression, 5-nearest-neighbours and a decision tree.
base_models = [
    LogisticRegression(solver='liblinear'),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(),
]

# Voting ensemble over the three different base learners, named by class name.
mixed_vote = VotingClassifier([(type(est).__name__, est) for est in base_models])

# Voting ensembles built from three copies of the same learner, for comparison.
lr_vote = VotingClassifier(
    [(tag, LogisticRegression(solver='liblinear', multi_class='auto')) for tag in "abc"]
)
knn_vote = VotingClassifier([(str(idx), KNeighborsClassifier()) for idx in range(3)])

classifiers = base_models + [mixed_vote, lr_vote, knn_vote]
names = ["LR", "5-NN", "DecisionTree", "Vote(LR,5-NN,DT)", "Vote(LRx3)", "Vote(5-NNx3)"]

# Fit every model on the training split and print its test accuracy.
for m_name, model in zip(names, classifiers):
    model.fit(X_train, y_train)
    print("\t", m_name, accuracy_score(y_test, model.predict(X_test)))