In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
def split(df,label):
    X_tr, X_te, Y_tr, Y_te = train_test_split(df, label, test_size=0.25, random_state=42)
    return X_tr, X_te, Y_tr, Y_te

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score


classifiers = ['LinearSVM', 'RadialSVM',
               'Logistic',  'RandomForest',
               'AdaBoost',  'DecisionTree',
               'KNeighbors','GradientBoosting']

models = [svm.SVC(kernel='linear'),
          svm.SVC(kernel='rbf'),
          LogisticRegression(max_iter = 1000),
          RandomForestClassifier(n_estimators=200, random_state=0),
          AdaBoostClassifier(random_state = 0),
          DecisionTreeClassifier(random_state=0),
          KNeighborsClassifier(),
          GradientBoostingClassifier(random_state=0)]
models = [DecisionTreeClassifier(random_state=0)] #test dt
def acc_score(df,label):
    Score = pd.DataFrame({"Classifier":classifiers})
    j = 0
    acc = []
    X_train,X_test,Y_train,Y_test = split(df,label)
    for i in models:
        model = i
        model.fit(X_train,Y_train)
        predictions = model.predict(X_test)
        acc.append(accuracy_score(Y_test,predictions))
        j = j+1
    Score["Accuracy"] = acc
    Score.sort_values(by="Accuracy", ascending=False,inplace = True)
    Score.reset_index(drop=True, inplace=True)
    return Score

def plot(score,x,y,c = "b"):
    gen = [1,2,3,4,5]
    plt.figure(figsize=(6,4))
    ax = sns.pointplot(x=gen, y=score,color = c )
    ax.set(xlabel="Generation", ylabel="Accuracy")
    ax.set(ylim=(x,y))

In [2]:
# Adapted Version
def initialization_of_population(size, n_feat):
    population = np.random.rand(size, n_feat) < 0.3
    return population.astype(np.bool_)

def fitness_score(population):
    scores = []
    for chromosome in population:
        logmodel.fit(X_train.iloc[:,chromosome],Y_train)
        predictions = logmodel.predict(X_test.iloc[:,chromosome])
        score = accuracy_score(Y_test,predictions)
        scores.append((score, chromosome))
    sorted_array = sorted(scores, key=lambda x: x[0],reverse = True)
    sorted_scores, sorted_population = zip(*sorted_array)
    return list(sorted_scores), list(sorted_population)

def selection(sorted_population,parent_chromosome_number):
    next_generation_population = []
    for i in range(parent_chromosome_number):
        next_generation_population.append(sorted_population[i])
    return next_generation_population

def crossover(parent_population):
    next_generation_population = []
    next_generation_population.extend(parent_population)
    for i in range(0,len(parent_population),2):
        new_chromosome = []
        child_1 = parent_population[i]
        child_2 = parent_population[i+1]
        crossover_point = len(child_1)//2
        new_chromosome = np.concatenate((child_1[:crossover_point],child_2[crossover_point:]))
        next_generation_population.append(new_chromosome)
    return next_generation_population

def mutation(population, mutation_rate,n_feat):
    next_generation_population = []
    mutation_number = int(mutation_rate*n_feat)
    for index,chromosome in enumerate(population):
        if index == 0:
            next_generation_population.append(chromosome)
            continue
        mutation_position = []
        new_chromosome = chromosome.copy()
        for i in range(0,mutation_number):
            position = randint(0,n_feat-1)
            mutation_position.append(position)
        for pos in mutation_position:
            new_chromosome[pos] = not chromosome[pos]
        next_generation_population.append(new_chromosome)
    return next_generation_population


def generations(df,label,size,n_feat,n_parents,mutation_rate,n_gen,X_train, X_test, Y_train, Y_test):
    best_chromo= []
    best_score= []
    population_nextgen=initialization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print('Best score in generation',i+1,':',scores[0])
        pop_after_sel = selection(pop_after_fit,n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate,n_feat)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
    return best_chromo,best_score


In [3]:
data_bc = pd.read_csv("sample_data/data.csv")
label_bc = data_bc["diagnosis"]
label_bc = np.where(label_bc == 'M',1,0)
data_bc.drop(["id","diagnosis","Unnamed: 32"],axis = 1,inplace = True)

print("Breast Cancer dataset:\n",data_bc.shape[0],"Records\n",data_bc.shape[1],"Features")
score1 = acc_score(data_bc,label_bc)
score1

Breast Cancer dataset:
 569 Records
 30 Features


ValueError: Length of values (1) does not match length of index (8)

In [4]:
logmodel = DecisionTreeClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=100,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

Best score in generation 1 : 0.9790209790209791
Best score in generation 2 : 0.9790209790209791
Best score in generation 3 : 0.9790209790209791
Best score in generation 4 : 0.9790209790209791
Best score in generation 5 : 0.9790209790209791
Best score in generation 6 : 0.9790209790209791
Best score in generation 7 : 0.9790209790209791
Best score in generation 8 : 0.9790209790209791
Best score in generation 9 : 0.9790209790209791
Best score in generation 10 : 0.9790209790209791
Best score in generation 11 : 0.9790209790209791
Best score in generation 12 : 0.9790209790209791
Best score in generation 13 : 0.993006993006993
Best score in generation 14 : 0.993006993006993
Best score in generation 15 : 0.993006993006993
Best score in generation 16 : 0.993006993006993
Best score in generation 17 : 0.993006993006993
Best score in generation 18 : 0.993006993006993
Best score in generation 19 : 0.993006993006993
Best score in generation 20 : 0.993006993006993
Best score in generation 21 : 0.99300

In [6]:
X_train = pd.read_csv("Data/Xtrain.csv")
X_test = pd.read_csv("Data/Xtest.csv")
Y_train = pd.read_csv("Data/Ytrain.csv")
Y_test = pd.read_csv("Data/Ytest.csv")

In [7]:
data = pd.read_csv("Data/year_prediction.csv")
label = data['label']
print("Million Songs dataset:\n",data.shape[0],"Records\n",data.shape[1],"Features")
score1 = acc_score(data,label)

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [9]:
Year = pd.read_csv('Data/year_prediction.csv')

In [9]:
scaler = StandardScaler()
X = Year.drop('label',axis=1)
y = Year['label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.9,random_state=100)

col = X_train.columns
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train,columns=col)

X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test,columns=col)

pca = PCA(random_state=42)
pca.fit(X_train)

print(len(pca.explained_variance_ratio_))
pca.explained_variance_ratio_

90


array([0.11509888, 0.07790812, 0.05908269, 0.04193843, 0.03750886,
       0.02785143, 0.02359574, 0.0229876 , 0.02094579, 0.02017323,
       0.01914213, 0.01860438, 0.01795661, 0.016911  , 0.0162782 ,
       0.01587638, 0.01514974, 0.01416422, 0.01405834, 0.01350418,
       0.01310675, 0.01260817, 0.01236738, 0.01179078, 0.01155292,
       0.01146151, 0.01109524, 0.01095227, 0.01047827, 0.01008891,
       0.00978087, 0.00944042, 0.00927768, 0.00894433, 0.00841613,
       0.00818835, 0.00804267, 0.00801281, 0.00765976, 0.00750439,
       0.00732581, 0.00719055, 0.00700093, 0.00680522, 0.00661245,
       0.00633611, 0.00628686, 0.00615171, 0.00598   , 0.0058407 ,
       0.0056721 , 0.00550313, 0.00536081, 0.00515497, 0.00501757,
       0.00499076, 0.00485278, 0.004733  , 0.00460899, 0.00437104,
       0.00424975, 0.0041122 , 0.00397558, 0.0039379 , 0.00371951,
       0.00365515, 0.0035191 , 0.00337947, 0.00325692, 0.00317536,
       0.00304071, 0.00291895, 0.00276109, 0.0027356 , 0.00262

In [10]:
pca = PCA(n_components = 60)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
col = ['C'+str(i) for i in range(1,61)]
X_train = pd.DataFrame(X_train,columns=col)
X_test = pd.DataFrame(X_test,columns=col)
X_test.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60
0,-1.172388,-0.74124,-0.277435,0.476964,-0.800808,0.409331,-0.386303,-1.425066,-0.932369,0.923172,...,-0.602887,-0.639125,-0.367147,-0.67751,-0.389927,1.141855,-0.092882,0.889122,0.238126,-0.058942
1,-3.608704,-1.31075,2.672325,0.949984,-0.805099,1.245864,0.807584,-0.216538,0.598077,-0.900713,...,0.589711,-0.160923,-0.207978,0.53674,1.035313,0.720426,-0.686481,-1.907624,-0.071587,-0.065726
2,-4.160324,-0.874602,1.506378,0.94767,0.782556,-0.171693,0.640408,0.063011,-1.094823,-0.09563,...,0.055905,0.312024,0.290411,0.127973,0.40199,-0.322713,-0.187898,-0.230951,-0.004757,0.222723
3,0.769099,3.817372,-1.903856,1.307942,-1.099492,-1.733354,-1.578549,-2.684431,-1.024785,0.337267,...,-0.262676,0.390822,0.064231,-0.777954,0.628404,0.379106,-0.232417,0.101876,-0.192402,0.248609
4,2.275427,-0.494207,-1.036636,-1.753217,1.648471,2.993101,-2.067733,-0.447315,-1.286245,-0.534801,...,-1.051183,-0.976042,-0.153428,0.366624,-0.359964,-0.405173,-1.669646,-0.696497,0.066673,-0.289318


In [11]:
#Repeating the process of PCA.
X = Year
y = Year.index
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,y,train_size=0.9,random_state=100)

col = X_train.columns
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train,columns=col)

X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test,columns=col)

pca = PCA(random_state=42)
pca.fit(X_train)
PCA(random_state=42)
print(len(pca.explained_variance_ratio_))
pca.explained_variance_ratio_

91


array([0.11384196, 0.07722196, 0.05865936, 0.04155237, 0.03710999,
       0.02778923, 0.02335032, 0.02293495, 0.02072541, 0.01998647,
       0.0190138 , 0.01868783, 0.01798696, 0.01680205, 0.01617176,
       0.01570191, 0.01498354, 0.01426762, 0.01392366, 0.01348102,
       0.01306842, 0.01255977, 0.01224155, 0.01170776, 0.01152151,
       0.01134001, 0.01110468, 0.01084872, 0.01037461, 0.01005532,
       0.0098599 , 0.00967338, 0.00917677, 0.00892622, 0.00845293,
       0.00827207, 0.00808512, 0.00793597, 0.00790757, 0.00756017,
       0.00742179, 0.00719013, 0.00709918, 0.00690354, 0.00672136,
       0.00653581, 0.00625751, 0.00621697, 0.00608369, 0.0059141 ,
       0.00577645, 0.00560976, 0.00543653, 0.00528803, 0.00509445,
       0.00494517, 0.00490422, 0.00478843, 0.00467447, 0.0045583 ,
       0.00431396, 0.00419335, 0.00405916, 0.00393153, 0.00386074,
       0.00365757, 0.00355558, 0.00338721, 0.00334162, 0.00321945,
       0.00313649, 0.00300121, 0.00288684, 0.00272871, 0.00261

In [12]:
pca = PCA(n_components = 30)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
X_train

array([[-3.77458655e+00, -1.41323333e+00,  1.79527734e+00, ...,
        -5.40424516e-01, -1.69913786e-01, -5.14812946e-01],
       [-2.93494438e+00, -2.91735308e-03,  1.39549337e+00, ...,
        -2.05009711e-01, -6.59464952e-01,  2.58083763e-01],
       [-8.63449923e-01,  1.72943667e+00, -6.69276186e-01, ...,
         4.82769200e-01, -1.00674627e+00, -7.96614371e-01],
       ...,
       [-2.58981946e+00, -1.56939711e+00, -3.66440357e-01, ...,
        -4.86620433e-01,  4.44698852e-01, -1.11587063e+00],
       [ 5.42755969e+00,  4.77066739e+00,  2.95113071e+00, ...,
        -9.09076477e-01, -5.07914982e-01, -6.96726038e-01],
       [-2.20619187e+00,  1.71365093e+00,  1.66890261e-01, ...,
         2.61616530e+00, -9.64454766e-01, -1.83615583e-01]])

In [13]:
col = ['C'+str(i) for i in range(1,31)]
X_train = pd.DataFrame(X_train,columns=col)
X_test = pd.DataFrame(X_test,columns=col)
X_test.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30
0,-1.180841,-0.786433,-0.327938,0.507773,-0.78685,0.468191,-0.272145,-1.336588,-0.836484,1.059291,...,-0.134347,0.392284,0.171236,-0.646317,0.137496,1.198852,0.692963,-1.092008,-0.378141,-1.307032
1,-3.615221,-1.324727,2.61994,1.013497,-0.780523,1.369917,0.790352,-0.006034,0.621302,-0.782228,...,-1.471658,1.510211,-0.812621,0.432861,2.017201,1.156784,1.095542,-1.036161,0.188069,0.004265
2,-4.167192,-0.897154,1.457899,0.99642,0.794048,-0.096546,0.603117,0.204235,-1.069588,-0.018985,...,0.508906,-0.674811,0.379356,-0.763699,0.156451,0.250146,0.329905,-0.347709,0.184154,-0.172047
3,0.774913,3.81721,-1.915902,1.262655,-1.128943,-1.82764,-1.321795,-2.725236,-0.947193,0.48132,...,-0.01511,-0.070561,0.888992,-1.296927,0.616041,0.196477,0.614748,-0.500049,0.308264,-1.26043
4,2.270455,-0.529222,-1.062292,-1.712467,1.679097,2.997177,-1.951019,-0.830726,-1.321091,-0.572157,...,-0.539062,-0.818449,1.48424,-0.075016,0.166691,0.045292,0.273681,1.638964,0.10959,-1.551502


In [14]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(463810, 30)
(51535, 30)
(463810,)
(51535,)


In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
Y_pred = dt.predict(X_test)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


In [None]:
logmodel = DecisionTreeClassifier(random_state=0)
chromo_df,score=generations(Year,Year.index,size=80,n_feat=30,n_parents=64,mutation_rate=0.20,n_gen=100,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [10]:
Year = pd.read_csv('Data/year_prediction.csv')
Year = Year.sample(frac=0.05, random_state=42)
scaler = StandardScaler()
X = Year.drop('label', axis=1)
y = Year['label']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=100)

col = X_train.columns
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=col)

X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test, columns=col)

pca = PCA(random_state=42)
pca.fit(X_train)

pca.explained_variance_ratio_
pca = PCA(n_components=60)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
col = ['C' + str(i) for i in range(1, 61)]
X_train = pd.DataFrame(X_train, columns=col)
X_test = pd.DataFrame(X_test, columns=col)
X_test.head()
#Repeating the process of PCA.
X = Year
y = Year.index
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=0.9, random_state=100)

col = X_train.columns
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=col)

X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test, columns=col)

pca = PCA(random_state=42)
pca.fit(X_train)
PCA(random_state=42)
pca.explained_variance_ratio_
pca = PCA(n_components=30)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
X_train
col = ['C' + str(i) for i in range(1, 31)]
X_train = pd.DataFrame(X_train, columns=col)
X_test = pd.DataFrame(X_test, columns=col)

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
Y_pred = dt.predict(X_test)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)