In [1]:
import pandas as pd

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv("train.csv")

# Print the DataFrame (optional)
print(df.head())


   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
0   1    Male   44                1         28.0                   0   
1   2    Male   76                1          3.0                   0   
2   3    Male   47                1         28.0                   0   
3   4    Male   21                1         11.0                   1   
4   5  Female   29                1         41.0                   1   

  Vehicle_Age Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
0   > 2 Years            Yes         40454.0                  26.0      217   
1    1-2 Year             No         33536.0                  26.0      183   
2   > 2 Years            Yes         38294.0                  26.0       27   
3    < 1 Year             No         28619.0                 152.0      203   
4    < 1 Year             No         27496.0                 152.0       39   

   Response  
0         1  
1         0  
2         1  
3         0  
4         0  


In [2]:
import pandas as pd

# Assuming your DataFrame is loaded as 'df' (replace with your actual DataFrame name)

# One-hot encode the categorical columns
df = pd.get_dummies(df, columns=['Gender', 'Vehicle_Damage','Vehicle_Age'], drop_first=True)

# Print the DataFrame
print(df.head())


   id  Age  Driving_License  Region_Code  Previously_Insured  Annual_Premium  \
0   1   44                1         28.0                   0         40454.0   
1   2   76                1          3.0                   0         33536.0   
2   3   47                1         28.0                   0         38294.0   
3   4   21                1         11.0                   1         28619.0   
4   5   29                1         41.0                   1         27496.0   

   Policy_Sales_Channel  Vintage  Response  Gender_Male  Vehicle_Damage_Yes  \
0                  26.0      217         1         True                True   
1                  26.0      183         0         True               False   
2                  26.0       27         1         True                True   
3                 152.0      203         0         True               False   
4                 152.0       39         0        False               False   

   Vehicle_Age_< 1 Year  Vehicle_Age_> 2 Yea

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Separate features and target variable
X = df.drop('Response', axis=1)
y = df['Response']

# Standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy - {accuracy:.4f}")


Logistic Regression: Accuracy - 0.8746
Naive Bayes: Accuracy - 0.7130
SVM: Accuracy - 0.8746
Decision Tree: Accuracy - 0.8131
KNN: Accuracy - 0.8500
Random Forest: Accuracy - 0.8688
XGBoost: Accuracy - 0.8684
Gradient Boosting: Accuracy - 0.8741
AdaBoost: Accuracy - 0.8742


In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate features and target variable
X = df.drop('Response', axis=1)
y = df['Response']

# Standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define top 3 models and parameter grids
top_models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}

param_grids = {
    "Logistic Regression": {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs']
    },
    "SVM": {
        'C': [0.01, 0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': [0.01, 0.1, 1]
    },
    "Random Forest": {
        'n_estimators': [100, 200, 300],
        'max_depth': [4, 6, 8],
        'min_samples_split': [2, 5, 10]
    }
}

# Perform GridSearchCV for each model
for name, model in top_models.items():
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_accuracy = grid_search.best_score_
    print(f"{name}: Best Parameters - {best_params}, Best Accuracy - {best_accuracy:.4f}")


Logistic Regression: Best Parameters - {'C': 0.001, 'solver': 'liblinear'}, Best Accuracy - 0.8780
