# Support Vector Machines
## Instructions
1. Use any dataset from https://archive.ics.uci.edu/datasets?Task=Clustering&skip=0&take=10&sort=desc&orderBy=NumHits&search=&Area=Biology
2. Each student must use a unique dataset; otherwise no points will be given, so coordinate with your classmates before choosing.

**TASK: Take 10 moderately sized subsamples of your dataset and create a correlation plot for each subsample**

**TASK (Classification): If your problem is a classification problem, visually check if the target variable is imbalanced**
**TASK (Regression): If your problem is a regression problem, visually check the distribution of your target variable**

**TASK: Take 10 moderately sized subsamples and create a pairplot of the feature variables in relation to the target variable**

**TASK: Create a clustermap with seaborn to explore the relationships between variables.**

**TASK: Perform SVM modelling and check the performance of your model; then improve the performance of your model using grid search**

In [5]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from ucimlrepo import fetch_ucirepo

In [None]:
# Load the UCI "gene expression cancer RNA-Seq" data into a single DataFrame.
#
# pd.read_csv(url) cannot parse this download: the URL points at a zip
# archive, not at a CSV file.
# NOTE(review): the zip is assumed to wrap a tarball that holds data.csv
# (expression matrix, sample id in the first column) and labels.csv (sample
# id plus one class column) -- confirm against the downloaded archive.
import io
import tarfile
import urllib.request
import zipfile

url = 'https://archive.ics.uci.edu/static/public/401/gene+expression+cancer+rna+seq.zip'


def _read_archive_csvs(zip_bytes, wanted):
    """Return ``{file name: DataFrame}`` for the wanted CSVs in a zip payload.

    CSVs are found whether they sit directly in the zip or inside a tarball
    nested in it. Only files whose base name is in ``wanted`` are parsed, and
    the first column of each file is used as the index.
    """
    frames = {}
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as outer:
        for name in outer.namelist():
            base = name.rsplit('/', 1)[-1]
            if base in wanted:
                frames[base] = pd.read_csv(io.BytesIO(outer.read(name)), index_col=0)
            elif name.endswith(('.tar', '.tar.gz', '.tgz')):
                # tarfile auto-detects the compression on a seekable file object.
                with tarfile.open(fileobj=io.BytesIO(outer.read(name))) as inner:
                    for member in inner:
                        inner_base = member.name.rsplit('/', 1)[-1]
                        if member.isfile() and inner_base in wanted:
                            frames[inner_base] = pd.read_csv(
                                inner.extractfile(member), index_col=0
                            )
    return frames


with urllib.request.urlopen(url, timeout=60) as response:
    frames = _read_archive_csvs(response.read(), wanted=('data.csv', 'labels.csv'))

missing = {'data.csv', 'labels.csv'} - frames.keys()
if missing:
    raise FileNotFoundError(f'Archive at {url} does not contain {sorted(missing)}')

# The single class column is exposed as 'label', the name the later cells use
# for the target; rows are aligned on the shared sample-id index.
df = frames['data.csv'].join(
    frames['labels.csv'].iloc[:, 0].rename('label'), how='inner'
)

df.head()

In [None]:
# Draw 10 reproducible row subsamples of df and show a correlation heatmap
# for each one.
N_SUBSAMPLES = 10
SUBSAMPLE_SIZE = 100
# An annotated heatmap is only readable for a handful of columns; wider
# frames are reduced to their highest-variance numeric columns for plotting.
MAX_HEATMAP_FEATURES = 20

# Never ask for more rows than exist (sampling without replacement would raise).
rows_per_subsample = min(SUBSAMPLE_SIZE, len(df))

# A distinct fixed seed per subsample keeps the ten draws different from each
# other but identical between notebook runs.
subsamples = [
    df.sample(n=rows_per_subsample, random_state=seed)
    for seed in range(N_SUBSAMPLES)
]

for i, subsample in enumerate(subsamples):
    # Correlation is only defined for numeric columns; a string column (such
    # as a class label) makes DataFrame.corr() raise on recent pandas.
    numeric = subsample.select_dtypes(include='number')
    if numeric.shape[1] > MAX_HEATMAP_FEATURES:
        widest_spread = numeric.var().nlargest(MAX_HEATMAP_FEATURES).index
        numeric = numeric[widest_spread]

    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric.corr(), annot=True)
    plt.title(f'Correlation Plot for Subsample {i+1}')
    plt.show()

In [None]:
# Summarise and plot the target variable. A 'label' column is treated as a
# classification target (class counts, bar chart); otherwise the 'target'
# column is treated as a regression target (summary statistics, histogram).
is_classification = 'label' in df.columns

print('Target variable distribution:')
if is_classification:
    class_counts = df['label'].value_counts()
    print(class_counts)

    plt.figure(figsize=(8, 6))
    axis = class_counts.plot(kind='bar')
    x_caption = 'Class'
else:
    target_values = df['target']
    print(target_values.describe())

    plt.figure(figsize=(8, 6))
    axis = target_values.hist(bins=30)
    x_caption = 'Value'

# Both chart types share the same title and y-axis caption.
axis.set_title('Target Variable Distribution')
axis.set_xlabel(x_caption)
axis.set_ylabel('Count')
plt.show()

In [None]:
# Pairplot of the features of each subsample, coloured by the 'label' target.
#
# A pairplot draws one axes per pair of variables, so the number of plotted
# features is capped; wider frames show only their highest-variance columns.
MAX_PAIRPLOT_FEATURES = 5

for i, subsample in enumerate(subsamples):
    features = subsample.drop(columns=['label']).select_dtypes(include='number')
    if features.shape[1] > MAX_PAIRPLOT_FEATURES:
        plotted = list(features.var().nlargest(MAX_PAIRPLOT_FEATURES).index)
    else:
        plotted = list(features.columns)

    # sns.pairplot is figure-level: it creates its own figure, so calling
    # plt.figure() first would only leave an empty figure behind.
    sns.pairplot(subsample, vars=plotted, hue='label', diag_kind='kde')
    # y > 1 lifts the title above the top row of axes instead of over it.
    plt.suptitle(f'Pairplot for Subsample {i+1}', y=1.02)
    plt.show()

In [None]:
# Hierarchically clustered heatmap of the correlations between the numeric
# columns of df.
#
# The full correlation matrix grows quadratically with the column count, so
# wide frames are reduced to their highest-variance columns first.
MAX_CLUSTERMAP_FEATURES = 50

numeric = df.select_dtypes(include='number')
# A constant column has an undefined (NaN) correlation with everything, and
# the clustering step rejects non-finite values.
numeric = numeric.loc[:, numeric.nunique() > 1]
if numeric.shape[1] > MAX_CLUSTERMAP_FEATURES:
    widest_spread = numeric.var().nlargest(MAX_CLUSTERMAP_FEATURES).index
    numeric = numeric[widest_spread]

# sns.clustermap is figure-level: it creates its own figure, so the size is
# passed to it directly instead of through a separate plt.figure() call.
sns.clustermap(numeric.corr(), figsize=(12, 10))
# A figure-level title avoids landing on whichever sub-axes happens to be
# current (heatmap, dendrogram or colour bar).
plt.suptitle('Clustermap of Variable Correlations', y=1.02)
plt.show()

In [None]:
# Split df into features/target and hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split

# Name of the target column; 'label' matches the column the plotting cells
# use as the class variable.
TARGET_COLUMN = 'label'

# SVC needs a purely numeric feature matrix, so non-numeric columns (for
# example identifiers) are left out along with the target itself.
X = df.drop(columns=[TARGET_COLUMN]).select_dtypes(include='number')
y = df[TARGET_COLUMN]

# stratify=y keeps the class proportions the same in both splits, which
# matters when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit a baseline SVM, report its accuracy, then tune C and gamma with a
# 5-fold cross-validated grid search and report the tuned accuracy.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVMs are sensitive to feature scale. Putting the scaler inside the pipeline
# means it is re-fitted on the training part of every CV fold, so no
# information from validation or test rows leaks into the scaling.
svm = make_pipeline(StandardScaler(), SVC())
svm.fit(X_train, y_train)

train_score = svm.score(X_train, y_train)
test_score = svm.score(X_test, y_test)
print(f'Train Accuracy: {train_score:.2f}')
print(f'Test Accuracy: {test_score:.2f}')

# make_pipeline names the SVC step 'svc', hence the 'svc__' prefixes.
# 'scale' (the SVC default) is kept in the grid so the search can never pick
# something worse than the baseline's own gamma.
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 0.1, 1, 10],
}
grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'Best parameters: {grid_search.best_params_}')

train_score = grid_search.score(X_train, y_train)
test_score = grid_search.score(X_test, y_test)
print(f'Optimized Train Accuracy: {train_score:.2f}')
print(f'Optimized Test Accuracy: {test_score:.2f}')