### Importing Libraries

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

### Creating Random Dataset with Categorical and Continous Variables

In [29]:
np.random.seed(0)
data = pd.DataFrame({
    'age': np.random.randint(18, 70, size=100),
    'income': np.random.randint(20000, 100000, size=100),
    'score': np.random.uniform(0, 1, size=100),
    'gender': np.random.choice(['Male', 'Female'], size=100),
    'city': np.random.choice(['New York', 'Los Angeles', 'Chicago'], size=100),
    'target': np.random.uniform(0, 1000, size=100)  # Continuous target variable for regression
})

### EDA

In [30]:
data.head()

Unnamed: 0,age,income,score,gender,city,target
0,62,57237,0.998847,Female,Chicago,699.622054
1,65,99701,0.149448,Male,Los Angeles,327.720402
2,18,28752,0.868126,Female,New York,756.778643
3,21,91331,0.162493,Female,Chicago,636.061055
4,21,70624,0.61556,Male,New York,240.020273


In [31]:
data.describe()

Unnamed: 0,age,income,score,target
count,100.0,100.0,100.0,100.0
mean,41.57,61144.98,0.470791,488.540837
std,15.537023,24686.931926,0.29133,307.798947
min,18.0,20469.0,0.011714,12.036223
25%,29.0,36709.25,0.212445,239.467016
50%,41.0,59264.0,0.45362,424.06062
75%,55.25,83122.75,0.718159,786.523697
max,69.0,99983.0,0.998847,992.011243


In [32]:
data.isnull().sum()

age       0
income    0
score     0
gender    0
city      0
target    0
dtype: int64

In [33]:
for col in data.columns:
    print(f'{col}  ->  {type(col)}')

age  ->  <class 'str'>
income  ->  <class 'str'>
score  ->  <class 'str'>
gender  ->  <class 'str'>
city  ->  <class 'str'>
target  ->  <class 'str'>


### Transforming categorical and continous columns using StandardScalar and OneHotEncoder Transformers

In [34]:
numeric_features = ['age', 'income', 'score']
categorical_features = ['gender', 'city']

### Combine Transformers using ColumnTransformer

In [35]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

### Splitting the dataset

In [36]:
X = data.drop('target', axis=1)
y = data['target']

In [37]:
X

Unnamed: 0,age,income,score,gender,city
0,62,57237,0.998847,Female,Chicago
1,65,99701,0.149448,Male,Los Angeles
2,18,28752,0.868126,Female,New York
3,21,91331,0.162493,Female,Chicago
4,21,70624,0.615560,Male,New York
...,...,...,...,...,...
95,32,50752,0.716860,Male,Chicago
96,33,99464,0.396060,Male,Chicago
97,38,91892,0.565421,Male,Chicago
98,53,53930,0.183280,Male,New York


In [38]:
y

0     699.622054
1     327.720402
2     756.778643
3     636.061055
4     240.020273
         ...    
95    171.853099
96    449.291649
97    304.468407
98    839.189122
99    237.741826
Name: target, Length: 100, dtype: float64

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [40]:
X_train.shape

(80, 5)

### Transforming the dataset

In [41]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [42]:
X_train

array([[-0.33370406, -1.52357347, -1.06562026,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [ 0.85977871, -1.14030493, -0.95844338,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-1.33874219,  1.19152386, -1.14834894,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-1.46437196,  0.06245833,  0.48000109,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.92259359,  1.22370118,  0.86595873,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [-0.96185289, -1.35906239, -0.82768535,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [-0.96185289,  0.5400036 ,  1.24454579,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [-0.08244453,  0.20256459, -1.20994209,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [ 0.67133406, -1.23603146

In [43]:
X_test

array([[-1.02466778, -0.40107228, -0.93391212,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.83622313, -1.47560953, -0.23312628,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-1.52718684, -1.32865703,  1.31477011,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 1.6135573 ,  1.07642651, -1.62941025,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.85977871, -0.99750046, -1.06756477,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.73414894, -0.86903281,  0.09043488,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-1.46437196,  1.49142937,  1.27175581,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ],
       [ 1.04822336, -0.29221709, -0.76071527,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.83622313,  1.10042862

### Defining models

In [44]:
models = {
    "Linear Regression": LinearRegression(),
    "SVR": SVR(),
    "Random Forest": RandomForestRegressor()
}

### Training the models

In [45]:
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {"MSE": mse, "R2": r2}

### Results

In [46]:
results

{'Linear Regression': {'MSE': 76611.17664551477, 'R2': 0.09344064194424617},
 'SVR': {'MSE': 92964.72458529609, 'R2': -0.10007501166358956},
 'Random Forest': {'MSE': 119559.00837928147, 'R2': -0.4147718731382972}}