<a href="https://colab.research.google.com/github/karim-mammadov/Kaggle-Datasets-MyMLProject/blob/main/Laptop_Prices_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload files into the Colab session. The following cell copies `kaggle.json`
# from the working directory, so that file must be among the uploads.
from google.colab import files
files.upload()

In [None]:
# Copy the uploaded Kaggle API token into ~/.kaggle (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download anubhavgoyal10/laptop-prices-dataset

In [None]:
import zipfile

In [None]:
# Unpack the downloaded dataset archive into the current working directory.
archive_path = '/content/laptop-prices-dataset.zip'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the extracted CSV; the bare `df` on the last line renders the frame as the cell output.
df=pd.read_csv('/content/laptopPrice.csv')
df

DATA CLEANING AND DATA VISUALIZATION

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows. Reassigning (instead of inplace=True) keeps the step
# explicit and chainable; the surviving rows keep their original index labels.
df = df.drop_duplicates()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
# Report, per int64/float64 column, how many rows lie outside the 1.5 * IQR fences.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    values = df[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    # A row is an outlier when it is strictly below the lower or strictly above the upper fence.
    outliers = df[values.lt(lower_fence) | values.gt(upper_fence)]
    print(f"{col} outlier sayi: {len(outliers)}")

In [None]:
def remove_outliers(df, factor=1.5):
    """Drop rows whose numeric values fall outside the IQR fences.

    Numeric columns are processed one after another, in column order. For each
    column the quartiles are computed on the rows that survived the previous
    columns, and only rows inside ``[Q1 - factor*IQR, Q3 + factor*IQR]``
    (inclusive) are kept. Rows with a missing value in a numeric column are
    dropped as well, because NaN fails both comparisons.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fence distance.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, with the same columns and the index labels of the
        rows that were kept.
    """
    # 'number' covers every numeric dtype (int32, float32, ...), not only int64/float64.
    numeric_columns = df.select_dtypes(include='number').columns

    for column in numeric_columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # Filtering is cumulative: the next column sees only the rows kept here.
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

    return df

In [None]:
# Bar chart of Price per operating system, categories ordered by descending mean price.
mean_price_by_os = df.groupby('os')['Price'].mean()
os_order = mean_price_by_os.sort_values(ascending=False).index

plt.figure(figsize=(8, 5))
sns.barplot(
    data=df,
    x='os',
    y='Price',
    hue='os',
    order=os_order,
    palette='Set2',
    legend=False,
)
plt.title('Average Price by Operating System', fontsize=14)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show();

In [None]:
# Box plot of the Price distribution for each RAM type.
plt.figure(figsize=(7, 5))
sns.boxplot(
    data=df,
    x='ram_type',
    y='Price',
    hue='ram_type',
    palette='cubehelix',
    legend=False,
)
plt.title('Price Distribution by RAM Type', fontsize=14)
plt.tight_layout()
plt.show();

In [None]:
# Bar chart of Price per brand, brands ordered by descending mean price,
# using one Set2 colour per distinct brand value.
mean_price_by_brand = df.groupby('brand')['Price'].mean()
brand_order = mean_price_by_brand.sort_values(ascending=False).index

num_brands = len(df['brand'].unique())
palette = sns.color_palette("Set2", num_brands)

plt.figure(figsize=(10, 6))
sns.barplot(
    data=df,
    x='brand',
    y='Price',
    hue='brand',
    palette=palette,
    order=brand_order,
    dodge=False,
    legend=False,
)

# Axis cosmetics: slanted tick labels, titles, and a light horizontal grid.
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.title("Brand-wise Average Price", fontsize=16, fontweight='bold')
plt.xlabel("Brand", fontsize=12)
plt.ylabel("Average Price", fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show();

In [None]:
# Apply the IQR filter. The text columns (hdd, ssd, ram_gb, ...) are only converted to
# integers in later cells, so at this point only the columns that are already numeric are filtered.
df=remove_outliers(df)

In [None]:
df.head()

In [None]:
# Keep the first run of digits found in each `hdd` value and store it as an integer.
# The raw-string regex avoids the invalid "\d" escape warning, expand=False returns a Series,
# and astype(str) keeps the cell re-runnable once the column is already integer.
df['hdd'] = df['hdd'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['hdd'].value_counts()

In [None]:
# Keep the first run of digits found in each `ram_gb` value and store it as an integer.
# The raw-string regex avoids the invalid "\d" escape warning, expand=False returns a Series,
# and astype(str) keeps the cell re-runnable once the column is already integer.
df['ram_gb'] = df['ram_gb'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['ram_gb'].value_counts()

In [None]:
# Keep the first run of digits found in each `ssd` value and store it as an integer.
# The raw-string regex avoids the invalid "\d" escape warning, expand=False returns a Series,
# and astype(str) keeps the cell re-runnable once the column is already integer.
df['ssd'] = df['ssd'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['ssd'].value_counts()

In [None]:
# Keep the first run of digits found in each `os_bit` value and store it as an integer.
# The raw-string regex avoids the invalid "\d" escape warning, expand=False returns a Series,
# and astype(str) keeps the cell re-runnable once the column is already integer.
df['os_bit'] = df['os_bit'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['os_bit'].value_counts()

In [None]:
# Keep the first run of digits found in each `graphic_card_gb` value and store it as an integer.
# The raw-string regex avoids the invalid "\d" escape warning, expand=False returns a Series,
# and astype(str) keeps the cell re-runnable once the column is already integer.
df['graphic_card_gb'] = df['graphic_card_gb'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['graphic_card_gb'].value_counts()

In [None]:
# Keep the first run of digits found in each `rating` value and store it as an integer.
# The raw-string regex avoids the invalid "\d" escape warning, expand=False returns a Series,
# and astype(str) keeps the cell re-runnable once the column is already integer.
df['rating'] = df['rating'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['rating'].value_counts()

In [None]:
# Remove every literal 'th' substring from `processor_gnrtn` (plain-text match, regex disabled).
# The values are still strings afterwards; no numeric conversion happens here.
df['processor_gnrtn'] = df['processor_gnrtn'].str.replace('th', '', regex=False)

In [None]:
df['processor_gnrtn'].dtype

In [None]:
df['processor_gnrtn'].value_counts()

In [159]:
# plt.pie(df['processor_gnrtn'].value_counts(), labels=df['processor_gnrtn'].unique(), autopct='%1.1f%%')
# plt.show();

In [None]:
df.head()

In [None]:
df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True)

Build a Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Target is an independent copy of the Price column; every other column is a feature.
y = df['Price'].copy()
X = df.drop(columns='Price')

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2, random_state=42)

In [None]:
num_feature= X_train.select_dtypes(include=[np.number]).columns
cat_feature= X_train.select_dtypes(exclude=[np.number]).columns

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent category, then one-hot encode.
# `fill_value` was dropped from the imputer: it is only consulted when strategy='constant',
# so passing it alongside 'most_frequent' had no effect and was misleading.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as all-zero rows instead of raising.
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its own sub-pipeline.
transformer = ColumnTransformer([
    ('num', num_pipeline, num_feature),
    ('cat', cat_pipeline, cat_feature)
])

estimator = LinearRegression()

# Preprocessing followed by ordinary least squares.
full_pipeline = Pipeline([
    ('preprocessing', transformer),
    ('estimator', estimator)
])

In [None]:
full_pipeline.fit(X_train, y_train)

In [None]:
full_pipeline.score(X_train, y_train), full_pipeline.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(full_pipeline, X_train, y_train, cv=5)

Random Forest Regressor

In [None]:
# Random-forest regressor with fixed hyper-parameters, placed behind the shared preprocessing step.
estimator_1 = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=42,
)

# The step name 'estimator_1' is the prefix used by the grid-search parameter keys.
full_pipeline_1 = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator_1', estimator_1),
])

In [None]:
full_pipeline_1.fit(X_train, y_train)

In [None]:
full_pipeline_1.score(X_train, y_train), full_pipeline_1.score(X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
params = {
    'estimator_1__n_estimators': [50, 100, 200],
    'estimator_1__max_depth': [10, 20, 30],
    'estimator_1__min_samples_split': [2, 3,5],
    'estimator_1__min_samples_leaf': [1, 2, 4]
}

In [None]:
grid_search = GridSearchCV(full_pipeline_1, params, cv=5)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
# Predict with `full_pipeline_1` and its hard-coded hyper-parameters; the grid-search
# result is not used here (that would be `grid_search.best_estimator_`).
y_pred2 = full_pipeline_1.predict(X_test)

In [None]:
# Show the first ten predictions together with the matching true prices. As two separate
# bare expressions, only the last one would be displayed and the first silently discarded.
y_pred2[:10], y_test[:10]

In [None]:
# First ten test-set prices next to the predictions stored in `y_pred2` for the same rows.
# The original referenced `y_pred`, which is never defined in this notebook (NameError).
comparison = pd.DataFrame({
    'Actual Price': y_test[:10].values,
    'Predicted Price': y_pred2[:10]
})

print(comparison)

In [None]:
r2_score(y_test, y_pred2)

In [None]:
mse = mean_squared_error(y_test, y_pred2)
mse

In [None]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred2)
print(mae)

In [None]:
sns.kdeplot(y_test, color='purple', label='Actual')
sns.kdeplot(y_pred2, color='black', label='Predicted')
plt.legend()
plt.show();

Decision Tree Regressor

In [None]:
# Single decision-tree regressor with fixed hyper-parameters, behind the shared preprocessing step.
estimator_2 = DecisionTreeRegressor(
    max_depth=10,
    min_samples_split=3,
    min_samples_leaf=3,
    random_state=42,
)

full_pipeline_2 = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator_2', estimator_2),
])

In [None]:
# Disabled hyper-parameter search for the decision-tree pipeline (kept for reference only;
# nothing in this cell executes).
# params = {
#     'estimator_2__max_depth': [5, 10, 15, None],
#     'estimator_2__min_samples_split': [2, 3, 5],
#     'estimator_2__min_samples_leaf': [1, 2, 3],
# }

# grid_search = GridSearchCV(full_pipeline_2, param_grid=params, cv=5, scoring='r2')
# grid_search.fit(X_train, y_train)

# print("Best parameters:", grid_search.best_params_)
# print("Best CV score:", grid_search.best_score_)

In [None]:
full_pipeline_2.fit(X_train, y_train)

In [None]:
y_pred3=full_pipeline_2.predict(X_test)

In [None]:
mae = mean_absolute_error(y_test, y_pred3)
mae

In [None]:
mse = mean_squared_error(y_test, y_pred3)
mse

In [None]:
r2_score(y_test, y_pred3)

In [None]:
sns.kdeplot(y_test, color='purple', label='Actual')
sns.kdeplot(y_pred3, color='black', label='Predicted')
plt.legend()
plt.show();

Ridge and Lasso Model

In [None]:
from sklearn.linear_model import Ridge, Lasso
# Regularised linear models with an explicit regularisation strength and iteration cap.
# NOTE(review): `estimator_4` is assigned again in a later cell before it is placed in a
# pipeline; only `estimator_3` is used by the pipeline built in this cell.
estimator_3 = Ridge(max_iter=10000, alpha=1.0)
estimator_4 = Lasso(max_iter=10000, alpha=0.1)

# Ridge regression behind the shared preprocessing step.
full_pipeline_3 = Pipeline([
    ('preprocessing', transformer),
    ('estimator_3', estimator_3)
])
# Disabled search over the Ridge alpha (kept for reference only).
# params = {
#     'estimator_3__alpha': [0.0001, 0.001, 0.005, 0.01, 0.1, 1]
# }
# grid = GridSearchCV(full_pipeline_3, params, cv=5)
# grid.fit(X_train, y_train)

# print(grid.best_params_)

In [None]:
full_pipeline_3.fit(X_train, y_train)

In [None]:
full_pipeline_3.score(X_train, y_train), full_pipeline_3.score(X_test, y_test)

In [None]:
# Lasso regression behind the shared preprocessing step.
# max_iter=10000 matches the cap used where the linear models were first defined;
# without it the solver falls back to its smaller default and may stop before converging.
estimator_4 = Lasso(max_iter=10000, alpha=0.1)

full_pipeline_4 = Pipeline([
    ('preprocessing', transformer),
    ('estimator_4', estimator_4)
])

In [None]:
full_pipeline_4.fit(X_train, y_train)

In [None]:
full_pipeline_4.score(X_train, y_train), full_pipeline_4.score(X_test, y_test)

In [None]:
y_pred4=full_pipeline_4.predict(X_test)

In [None]:
r2_score(y_test, y_pred4)

In [None]:
sns.kdeplot(y_test, color='purple', label='Actual')
sns.kdeplot(y_pred4, color='black', label='Predicted')
plt.legend()
plt.show();

Model Performance Comparison: R² Score and MSE

In [None]:
# Fit each of the four pipelines below on the training split, score it on the test split,
# and plot R² and MSE side by side.
pipelines = {
    'Pipeline 1': full_pipeline_1,
    'Pipeline 2': full_pipeline_2,
    'Pipeline 3': full_pipeline_3,
    'Pipeline 4': full_pipeline_4,
}


def _evaluate(label, model):
    """Fit `model` on the training split and return its test-set R² and MSE as one record."""
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    return {
        'Model': label,
        'R2_Score': r2_score(y_test, test_predictions),
        'MSE': mean_squared_error(y_test, test_predictions),
    }


results = [_evaluate(label, model) for label, model in pipelines.items()]
results_df = pd.DataFrame(results)

plt.figure(figsize=(14, 6))

# Left panel: R² per model, y-axis fixed to the [0, 1] range.
plt.subplot(1, 2, 1)
sns.barplot(
    data=results_df, x='Model', y='R2_Score',
    hue='Model', palette='viridis', legend=False,
)
plt.title('R² Score Comparison')
plt.ylim(0, 1)

# Right panel: mean squared error per model.
plt.subplot(1, 2, 2)
sns.barplot(
    data=results_df, x='Model', y='MSE',
    hue='Model', palette='magma', legend=False,
)
plt.title('Mean Squared Error Comparison')

plt.tight_layout();
plt.show();