In [77]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd

In [78]:
dataset=pd.read_csv('Color_classification_lab_CIE.csv')
dataset.head(5)

Unnamed: 0,S. No.,Power,Scanning speed,Pulse width,Frequency,Color,R,G,B,L,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,1,9,1000,5,50,No Color,0,0,0,0.0,...,,,,,,,,,,
1,2,9,1000,5,100,No Color,0,0,0,0.0,...,,,,,,,,,,
2,3,9,1000,5,150,No Color,0,0,0,0.0,...,,,,,,,,,,
3,4,9,1000,5,200,No Color,0,0,0,0.0,...,,,,,,,,,,
4,5,9,1000,5,250,No Color,0,0,0,0.0,...,,,,,,,,,,


#### Keeping only columns Power, Scanning Speed, Pulse width,Frequency and Colour. Here Colour is the response variable

In [79]:
dataset=dataset.iloc[:,1:6]
dataset

Unnamed: 0,Power,Scanning speed,Pulse width,Frequency,Color
0,9,1000,5,50,No Color
1,9,1000,5,100,No Color
2,9,1000,5,150,No Color
3,9,1000,5,200,No Color
4,9,1000,5,250,No Color
...,...,...,...,...,...
295,9,1056,20,300,Hurricane Brown
296,9,1167,20,300,Americano Brown
297,9,1278,20,300,Squirrel Brown
298,9,1389,20,300,Squirrel Brown


In [80]:
#Check for null values
dataset.isnull().sum()

Power              0
Scanning speed     0
Pulse width        0
Frequency          0
Color              0
dtype: int64

In [81]:
# Encoding the colour columns
dataset['Color'] = dataset['Color'].apply(lambda x: 0 if x == 'No Color' else 1)
dataset

Unnamed: 0,Power,Scanning speed,Pulse width,Frequency,Color
0,9,1000,5,50,0
1,9,1000,5,100,0
2,9,1000,5,150,0
3,9,1000,5,200,0
4,9,1000,5,250,0
...,...,...,...,...,...
295,9,1056,20,300,1
296,9,1167,20,300,1
297,9,1278,20,300,1
298,9,1389,20,300,1


In [82]:
dataset.columns=['Power','SS','PW','Frequency','Color']
dataset

Unnamed: 0,Power,SS,PW,Frequency,Color
0,9,1000,5,50,0
1,9,1000,5,100,0
2,9,1000,5,150,0
3,9,1000,5,200,0
4,9,1000,5,250,0
...,...,...,...,...,...
295,9,1056,20,300,1
296,9,1167,20,300,1
297,9,1278,20,300,1
298,9,1389,20,300,1


In [83]:
from imblearn.combine import SMOTETomek

# Resampling the minority class. The strategy can be changed as required.
smt = SMOTETomek(random_state=42,sampling_strategy='not majority')
# Fit the model to generate the data.
X,Y = smt.fit_resample(dataset.iloc[:,:-1], dataset.iloc[:,-1])
dataset

Unnamed: 0,Power,SS,PW,Frequency,Color
0,9,1000,5,50,0
1,9,1000,5,100,0
2,9,1000,5,150,0
3,9,1000,5,200,0
4,9,1000,5,250,0
...,...,...,...,...,...
295,9,1056,20,300,1
296,9,1167,20,300,1
297,9,1278,20,300,1
298,9,1389,20,300,1


In [84]:
# Print Single Way ANOVA for individual variable
def an(A,rows,cols):
    # Calculate the sum of rows
    Sum_of_rows=np.sum(A,axis=1)
    # Calculate the sum of Columns
    Sum_of_cols=np.sum(A,axis=0)
    Total_sum_of_elements=np.sum(Sum_of_cols,axis=0)
    # Calculate Sum of Sqauares total
    SST=0
    for i in range(rows):
       for j in range(cols):
           SST+=A[i,j]**2

    print(SST)
    SST-=Total_sum_of_elements**2/(rows*cols)
    print("Sum of total :",SST)
    # Calculation on treatments
    SStreatments=0
    print(Sum_of_rows.shape)
    for i in range(Sum_of_rows.shape[0]):
        SStreatments+=Sum_of_rows[i]**2
    SStreatments/=Sum_of_cols.shape[0]
    SStreatments-=Total_sum_of_elements**2/(rows*cols)
    print("Sum of Treatments :",SStreatments)
    # Error
    SSerror=SST-SStreatments
    print("Error",SSerror)

    # Calculation of degrees of Freedom
    dft=Sum_of_rows.shape[0]-1
    dfw=rows*cols-dft-1
    dftotal=rows*cols-1
    # Calculation of Mean squares
    mst=(SStreatments/dft)
    mse=SSerror/dfw
    # calculate the F statistic and p-value
    # F-value
    Ftreat=mst/mse
    p_val = stats.f.sf(Ftreat, dft, dfw)
    # create the ANOVA table
    anova_table = pd.DataFrame({
    'Source of Variation': ['Between Treatments', 'Error (Within Treatments)', 'Total'],
    'Sum of Squares': [SStreatments, SSerror, SST],
    'Degrees of Freedom': [dft, dfw, dftotal],
    'Mean Square': [mst, mse, np.nan],
    'F value': [Ftreat, np.nan, np.nan],
    'p-value': [p_val, np.nan, np.nan]
    })

    # set the index to the source column
    anova_table.set_index('Source of Variation', inplace=True)

    # display the ANOVA table
    print(anova_table)

### Single ANOVA on Frequency

In [85]:
# Variation of Frequency
freq=dataset['Frequency'].unique()
print(freq)
print("Number of datapoints :",dataset['Frequency'].value_counts())

[ 50 100 150 200 250 300 350 400 450 500]
Number of datapoints : 300    210
50      10
100     10
150     10
200     10
250     10
350     10
400     10
450     10
500     10
Name: Frequency, dtype: int64


In [86]:
A=[]
for i in freq:
    s=[]
    s=dataset[dataset['Frequency']==i]['Color'][0:10]
    #print(len(s))
    A.append(s)
dataset['Frequency']

0       50
1      100
2      150
3      200
4      250
      ... 
295    300
296    300
297    300
298    300
299    300
Name: Frequency, Length: 300, dtype: int64

In [87]:
A=np.array(A)
an(A,len(freq),10)

90
Sum of total : 9.0
(10,)
Sum of Treatments : 0.7999999999999972
Error 8.200000000000003
                           Sum of Squares  Degrees of Freedom  Mean Square  \
Source of Variation                                                          
Between Treatments                    0.8                   9     0.088889   
Error (Within Treatments)             8.2                  90     0.091111   
Total                                 9.0                  99          NaN   

                           F value   p-value  
Source of Variation                           
Between Treatments         0.97561  0.465583  
Error (Within Treatments)      NaN       NaN  
Total                          NaN       NaN  


### Single ANOVA on Pulse width

In [88]:
# Variation of Frequency
pulse=dataset['PW'].unique()
print(pulse)
print("Number of datapoints :",dataset['PW'].value_counts())

[ 5 10 15 20 25 30 35 40 45 50  2  4  6  8 12 14 16 18]
Number of datapoints : 5     110
10     21
20     20
2      10
16     10
14     10
12     10
8      10
6      10
4      10
50     10
45     10
40     10
35     10
30     10
25     10
18     10
15      9
Name: PW, dtype: int64


In [89]:
# Minimum datapoints found 9
A=[]
for i in pulse:
    s=[]
    s=dataset[dataset['PW']==i]['Color'][0:9]
    #print(len(s))
    A.append(s)

In [90]:
A=np.array(A)
an(A,len(pulse),9)

153
Sum of total : 8.5
(18,)
Sum of Treatments : 5.388888888888886
Error 3.1111111111111143
                           Sum of Squares  Degrees of Freedom  Mean Square  \
Source of Variation                                                          
Between Treatments               5.388889                  17     0.316993   
Error (Within Treatments)        3.111111                 144     0.021605   
Total                            8.500000                 161          NaN   

                             F value       p-value  
Source of Variation                                 
Between Treatments         14.672269  1.207762e-23  
Error (Within Treatments)        NaN           NaN  
Total                            NaN           NaN  


In [91]:
import statsmodels.formula.api as smf
# Fit a regression model with main effects and two-factor interactions
model = smf.ols(formula='Color ~ Power + SS + PW + Frequency + Power:SS + Power:PW + Power:Frequency+SS:PW+SS:Frequency+PW:Frequency+Power:SS:PW+SS:PW:Frequency+SS:Power:Frequency+PW:Power:Frequency', data=dataset).fit()

# Print the model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Color   R-squared:                       0.334
Model:                            OLS   Adj. R-squared:                  0.318
Method:                 Least Squares   F-statistic:                     20.90
Date:                Sun, 30 Apr 2023   Prob (F-statistic):           9.30e-23
Time:                        22:07:55   Log-Likelihood:                -41.048
No. Observations:                 300   AIC:                             98.10
Df Residuals:                     292   BIC:                             127.7
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept           4.879e-08   6.71

In [92]:
import statsmodels.api as sm
#model = sm.ols(formula='Color ~ Power + SS + PW + Frequency + Power:SS + Power:PW + Power:Frequency+SS:PW+SS:Frequency+PW:Frequency', data=dataset).fit()

# Print the model summary
#print(model.summary())
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

                          sum_sq     df             F        PR(>F)
Power               5.238184e-03    1.0  6.623258e-02  7.970845e-01
SS                  2.825095e-08    1.0  3.572103e-07  9.995235e-01
PW                  5.786158e-11    1.0  7.316126e-10  9.999784e-01
Frequency          -8.105399e-12    1.0 -1.024862e-10  1.000000e+00
Power:SS            5.917186e+00    1.0  7.481800e+01  3.504414e-16
Power:PW            9.623778e+00    1.0  1.216849e+02  6.970272e-24
Power:Frequency     3.764591e+00    1.0  4.760020e+01  3.247131e-11
SS:PW               8.019065e-01    1.0  1.013946e+01  1.608161e-03
SS:Frequency        7.989262e-03    1.0  1.010177e-01  7.508389e-01
PW:Frequency        2.880683e-01    1.0  3.642390e+00  5.730642e-02
Power:SS:PW         2.811692e+00    1.0  3.555156e+01  7.157513e-09
SS:PW:Frequency     1.306402e+00    1.0  1.651840e+01  6.199484e-05
SS:Power:Frequency  7.092561e-01    1.0  8.967966e+00  2.982840e-03
PW:Power:Frequency  6.995556e-02    1.0  8.84531

In [93]:
# 1. Logistic regression with L2 regularizer (Hyperparameter: regularization constant C)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import fbeta_score
# Setting seed value
seed=200
from sklearn.linear_model import LogisticRegression
# Define parameter grid to search over
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Perform 5-fold cross-validation with stratified sampling to balance all the classes in the cross validation set
cv = StratifiedKFold(n_splits=5)

# Create a logistic regression classifier with L2 regularization
lr = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, class_weight='balanced')

# Perform grid search over parameter grid using cross-validation
grid_search = GridSearchCV(lr, param_grid, cv=cv, scoring='accuracy')

# Fit the grid search to the training data
grid_search.fit(X, Y)
lr1=LogisticRegression(penalty='l2', C=0.01,solver='lbfgs', max_iter=1000, class_weight='balanced')
lr1.fit(X,Y)
y_train=Y
y_test=Y

y_pred_train=grid_search.predict(X)
y_pred_test=grid_search.predict(X)

print("Coefficients of Logistic regression :",lr1.coef_)

# Print the best hyperparameters and accuracy score,precision,recall,specificity and sensitivity
print("Best hyperparameters: ", grid_search.best_params_)

print("\nAccuracy")
print("Training Set: ", grid_search.score(X, Y))
print("Test Set: ", grid_search.score(X, Y))

print("\nPrecision")
print("Training Set: ", precision_score(y_train,y_pred_train, average='macro'))
print("Test Set: ", precision_score(y_test,y_pred_test, average='macro'))

print("\nRecall")
print("Training Set: ", recall_score(y_train,y_pred_train, average='macro'))
print("Test Set: ", recall_score(y_test,y_pred_test, average='macro'))

tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train,y_pred_train).ravel()
specificity_train = tn_train / (tn_train+fp_train)
sensitivity_train = tp_train / (tp_train+fn_train)

tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test,y_pred_test).ravel()
specificity_test = tn_test / (tn_test+fp_test)
sensitivity_test = tp_test / (tp_test+fn_test)

print('\nSpecificity')
print("Training Set: ", specificity_train)
print("Test Set: ", specificity_test)

print('\nSensitivity')
print("Training Set: ", sensitivity_train)
print("Test Set: ", sensitivity_test)

print('\nF1_score')
print("Training Set: ", fbeta_score(y_train, y_pred_train, average='micro', beta=1))
print("Test Set: ",  fbeta_score(y_test, y_pred_test, average='micro', beta=1))

Coefficients of Logistic regression : [[0.5424988  0.00147962 0.36225149 0.01711713]]
Best hyperparameters:  {'C': 0.01}

Accuracy
Training Set:  0.9382239382239382
Test Set:  0.9382239382239382

Precision
Training Set:  0.9408527166659169
Test Set:  0.9408527166659169

Recall
Training Set:  0.9382239382239382
Test Set:  0.9382239382239382

Specificity
Training Set:  0.9768339768339769
Test Set:  0.9768339768339769

Sensitivity
Training Set:  0.8996138996138996
Test Set:  0.8996138996138996

F1_score
Training Set:  0.9382239382239382
Test Set:  0.9382239382239382
