In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
import numpy as np
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.metrics import r2_score

In [8]:
# Read the wine CSV from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='wine_agg.csv')


In [9]:
# The response is the LogAuctionIndex column; every remaining column is a predictor.
x = data.drop(columns=['LogAuctionIndex'])
y = data['LogAuctionIndex']
# Hold out 20% of the rows for out-of-sample evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
#2A
# Choose the Lasso penalty by 5-fold cross-validation on the training split.
lasso_cv = LassoCV(cv=5, random_state=42)
lasso_cv.fit(x_train, y_train)
best_alpha = lasso_cv.alpha_
print(f"Best alpha: {best_alpha}")

Best alpha: 0.03273531874228395


In [13]:
# Fit a plain Lasso at the cross-validated penalty, then predict the held-out rows.
lasso = Lasso(alpha=best_alpha).fit(x_train, y_train)
y_pred = lasso.predict(x_test)

In [14]:
#OSR2
# Out-of-sample R^2 = 1 - SSE / SST, where SST is measured against the
# training-set mean of the response (not the test-set mean).
numerator = (y_test - y_pred).pow(2).sum()
denominator = (y_test - y_train.mean()).pow(2).sum()
osr2 = 1 - numerator / denominator
print(f"OSR2: {osr2}")

OSR2: 0.3862533319966974


In [15]:
# Label each fitted Lasso coefficient with its feature name and report the
# features whose coefficient is exactly zero.
coeffs = pd.Series(data=lasso.coef_, index=x.columns)
zero_coeffs = coeffs.loc[coeffs.eq(0)]
print(f"Number of zero coefficients: {len(zero_coeffs)}")
print(zero_coeffs.index.tolist())

Number of zero coefficients: 2
['Year', 'USAlcConsump']


In [17]:
#2B
# Candidate ridge penalties: 100 values log-spaced between 1e-4 and 1e4.
alphas = np.logspace(-4, 4, 100)
# Pick the penalty with RidgeCV's built-in cross-validation over `alphas`.
# The former store_cv_values=True argument was dropped: the stored per-alpha CV
# values are never read in this notebook, and the parameter is deprecated in
# recent scikit-learn releases (renamed store_cv_results), so passing it warns
# or fails on newer versions. The selected alpha_ is unaffected.
ridge_cv = RidgeCV(alphas=alphas).fit(x_train, y_train)
best_beta = ridge_cv.alpha_
print(f"Best beta: {best_beta}")

Best beta: 1.592282793341094




In [18]:
# Fit a plain Ridge model at the cross-validated penalty, then predict the held-out rows.
ridge = Ridge(alpha=best_beta).fit(x_train, y_train)
y_pred = ridge.predict(x_test)


In [19]:
# Out-of-sample R^2 for the ridge predictions: 1 - SSE / SST, with SST taken
# around the training-set mean of the response.
numerator = (y_test - y_pred).pow(2).sum()
denominator = (y_test - y_train.mean()).pow(2).sum()
osr2 = 1 - numerator / denominator
print(f"OSR2: {osr2}")

OSR2: 0.32856394760953944


In [20]:
# Label each fitted ridge coefficient with its feature name and report any
# that are exactly zero.
coeffs = pd.Series(data=ridge.coef_, index=x.columns)
zero_coeffs = coeffs.loc[coeffs.eq(0)]
print(f"Number of zero coefficients: {len(zero_coeffs)}")
print(zero_coeffs.index.tolist())

Number of zero coefficients: 0
[]


In [7]:
#4
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline




In [8]:
# Load the train/test feature and label arrays from .npy files in the working
# directory (presumably 12x12 MNIST digits, judging by the file names).
x_train, y_train = np.load("mnist12x12_trainfeats.npy"), np.load("mnist12x12_trainlabels.npy")
x_test, y_test = np.load("mnist12x12_testfeats.npy"), np.load("mnist12x12_testlabels.npy")

In [9]:
# Standalone scaler instance. NOTE(review): no visible later cell references
# `scaler`; the SVM pipelines below construct their own StandardScaler steps,
# so this looks like a leftover — confirm before removing.
scaler = StandardScaler()

In [11]:
#linear svm
# Scaling lives inside the pipeline so it is applied together with the
# classifier during cross-validation.
linear_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='linear')),
])

# 5-fold cross-validated accuracy on the training data.
linear_scores = cross_val_score(
    linear_pipeline,
    x_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print(f"Linear SVM cross-validation accuracy: {linear_scores.mean()}")

Linear SVM cross-validation accuracy: 0.9392666666666667


In [18]:

# RBF-kernel SVM preceded by feature standardization.
rbf_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC(kernel='rbf'))])

# Search grid: 3 values of C x 3 values of gamma, addressed through the 'svc' step.
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 0.01, 0.001],
}

# Exhaustive 5-fold grid search scored by accuracy.
grid = GridSearchCV(rbf_pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy: {:.4f}".format(grid.best_score_))

Best Parameters: {'svc__C': 10, 'svc__gamma': 'scale'}
Best Cross-Validation Accuracy: 0.9741


In [19]:
# Fit the linear pipeline on the full training set so it can be scored on the
# test set (the cross-validation call earlier does not leave it fitted).
linear_pipeline.fit(x_train, y_train)
# GridSearchCV was built with its default refit behaviour, so best_estimator_
# is already trained on all of x_train with the winning parameters. The second
# rbf_best.fit(...) that used to follow only repeated that expensive fit and
# has been removed.
rbf_best = grid.best_estimator_

In [21]:
# Accuracy of both fitted SVM pipelines on the held-out test set.
linear_test_acc = accuracy_score(y_true=y_test, y_pred=linear_pipeline.predict(x_test))
rbf_test_acc = accuracy_score(y_true=y_test, y_pred=rbf_best.predict(x_test))

print(f"Linear SVM Test Accuracy: {linear_test_acc:.4f}")
print(f"RBF SVM Test Accuracy: {rbf_test_acc:.4f}")

Linear SVM Test Accuracy: 0.9452
RBF SVM Test Accuracy: 0.9774


In [25]:
#Q5B
import pandas as pd
# Read the circles data set; only the X1, X2 and y columns are used by later cells.
circles = pd.read_csv(filepath_or_buffer='circles_data.csv')
# Bare expression so the notebook renders the frame.
circles

Unnamed: 0,X1,X2,y
0,0.490453,0.192749,1.0
1,-0.879728,-0.375273,0.0
2,0.409955,0.295220,1.0
3,0.291393,-0.391432,1.0
4,-0.493470,0.246672,1.0
...,...,...,...
995,0.245149,-0.377907,1.0
996,-0.287007,0.453802,1.0
997,-0.571187,0.797375,0.0
998,-0.073063,1.103889,0.0


In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


In [32]:
# Two coordinate features and the label, extracted as NumPy arrays.
x = circles.loc[:, ['X1', 'X2']].values
y = circles.loc[:, 'y'].values
# Hold out 20% of the points for testing; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
#svm with rbf kernel
# Standardize, then fit an RBF-kernel SVM (C=1, gamma='scale') on the training points.
svm_rbf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', C=1, gamma='scale')),
])
svm_rbf.fit(x_train, y_train)

# Score the predictions on the held-out points.
y_pred_rbf = svm_rbf.predict(x_test)
accuracy_rbf = accuracy_score(y_true=y_test, y_pred=y_pred_rbf)
print(f"SVM with RBF kernel accuracy: {accuracy_rbf:.4f}")

SVM with RBF kernel accuracy: 1.0000


In [34]:
#LDA
# Linear discriminant analysis on the same split, scored on the held-out points.
lda = LinearDiscriminantAnalysis().fit(x_train, y_train)
y_pred_lda = lda.predict(x_test)
accuracy_lda = accuracy_score(y_true=y_test, y_pred=y_pred_lda)
print(f"LDA accuracy: {accuracy_lda:.4f}")

LDA accuracy: 0.4800


In [86]:
#Q7
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Load the separate training and test CSV files (train first, then test).
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('alzheimers_train.csv', 'alzheimers_test.csv')
)


In [71]:
# Split each table into predictors and the 'Diagnosis' target column.
x_train, y_train = train.drop(columns=['Diagnosis']), train['Diagnosis']
x_test, y_test = test.drop(columns=['Diagnosis']), test['Diagnosis']

In [72]:
# Partition the predictor columns by dtype so each group gets its own preprocessing.
numerical_features = x_train.select_dtypes(include=['number']).columns
categorical_features = x_train.select_dtypes(include=['object']).columns

In [73]:
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fitting are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])


# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])


In [74]:
#Random Forest
# Preprocessing followed by a 100-tree random forest with a fixed seed.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# 5-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(
    rf_pipeline,
    x_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print(f"Random Forest cross-validation accuracy: {cv_scores.mean():.4f}")

Random Forest cross-validation accuracy: 0.8214


In [75]:
# Fit the random-forest pipeline on all training rows and predict the test set.
y_pred = rf_pipeline.fit(x_train, y_train).predict(x_test)
print(f"Random Forest test accuracy: {accuracy_score(y_test, y_pred):.4f}")

Random Forest test accuracy: 0.7667


In [76]:
#Logistic Regression
# Preprocessing followed by logistic regression; max_iter is raised to 1000.
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Classifier', LogisticRegression(max_iter=1000)),
])

# 5-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(
    logreg_pipeline,
    x_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print(f"Logistic Regression cross-validation accuracy: {cv_scores.mean():.4f}")

Logistic Regression cross-validation accuracy: 0.8500


In [77]:
# Fit the logistic-regression pipeline on all training rows and predict the test set.
y_pred = logreg_pipeline.fit(x_train, y_train).predict(x_test)
print(f"Logistic Regression test accuracy: {accuracy_score(y_test, y_pred):.4f}")

Logistic Regression test accuracy: 0.7833


In [82]:
#Gradient Boosting

# Preprocessing followed by a histogram-based gradient boosting classifier with a fixed seed.
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', HistGradientBoostingClassifier(random_state=42)),
])

# 5-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(
    gb_pipeline,
    x_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print(f"Gradient Boosting cross-validation accuracy: {cv_scores.mean():.4f}")

Gradient Boosting cross-validation accuracy: 0.8571


In [83]:
# Search grid for the boosting step: 3 depths x 3 learning rates x 2 iteration
# counts, addressed through the pipeline's 'classifier' step.
param_grid = {
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__max_iter': [100, 200],
}

# Exhaustive 5-fold grid search scored by accuracy. Note that this rebinds the
# `param_grid` and `grid` names used by the earlier SVM search.
grid = GridSearchCV(gb_pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy: {:.4f}".format(grid.best_score_))

Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__max_iter': 100}
Best Cross-Validation Accuracy: 0.8571


In [85]:
# Accuracy of the best grid-search pipeline on the held-out test set.
test_acc = accuracy_score(y_test, grid.best_estimator_.predict(x_test))
print(f"Gradient Boosting test accuracy: {test_acc:.4f}")

Gradient Boosting test accuracy: 0.9000
