<a href="https://colab.research.google.com/github/leman-cap13/my_projects/blob/main/Laptop_Prices_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
files.upload()

In [None]:
# Put kaggle.json (presumably the file uploaded in the previous cell) into ~/.kaggle,
# creating the directory first if it does not exist yet.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download anubhavgoyal10/laptop-prices-dataset

In [None]:
import zipfile

In [None]:
# The archive is fetched by the `!kaggle datasets download` cell and extracted with
# extractall(), both relative to the current working directory. Open it with a relative
# path too, instead of hard-coding Colab's /content, so the cell works wherever it runs.
with zipfile.ZipFile('laptop-prices-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# extractall() unpacked the archive into the current working directory, so read the CSV
# from there rather than from a hard-coded /content path (identical on Colab).
df = pd.read_csv('laptopPrice.csv')
df

# Data Cleaning and Data Visualization

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
# For every float64/int64 column, count the rows that fall outside the
# 1.5 * IQR fences and print that count (message text: "outlier count for <col>").
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    values = df[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    outliers = df[(values < lower_fence) | (values > upper_fence)]
    print(f"{col} üçün outlier sayı: {len(outliers)}")


In [None]:
import math


# One box plot per float64/int64 column, laid out on a grid three plots wide.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
cols = numeric_df.columns

n = len(cols)
rows = math.ceil(n / 3)  # number of grid rows needed for n plots, three per row

fig = plt.figure(figsize=(15, rows * 4))

for i, col in enumerate(cols, 1):
    ax = fig.add_subplot(rows, 3, i)
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(col)

fig.tight_layout()
plt.show()


In [None]:
def dataplot(col):
    """Show a count plot of one column of the module-level ``df``.

    Args:
        col: Name of the ``df`` column whose value counts are drawn on the x axis.

    The figure is 10x6 inches, x tick labels are rotated vertically, and the
    plot is displayed immediately with ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))
    # seaborn >= 0.13 deprecates `palette` without `hue`; colouring by the same
    # column keeps the per-bar colours, and legend=False hides the redundant legend.
    sns.countplot(data=df, x=col, hue=col, palette='plasma', legend=False)
    plt.xticks(rotation='vertical')
    plt.show()

features = ['brand', 'ram_gb', 'processor_name', 'processor_gnrtn', 'os']

for col in features:
    dataplot(col)

In [None]:
# Bar plot of Price by brand, one colour per brand.
plt.figure(figsize=(10, 7))
unique_brands = df['brand'].unique()
palette = sns.color_palette("hsv", len(unique_brands))
# `hue` is passed together with `palette` (as the ram_gb plot does) because seaborn
# deprecates `palette` without `hue`; legend=False hides the redundant legend.
sns.barplot(data=df, x='brand', y='Price', hue='brand', palette=palette, legend=False)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Price by Touchscreen. `hue` accompanies `palette` (as in the ram_gb plot) because
# seaborn deprecates `palette` without `hue`; legend=False hides the redundant legend.
sns.barplot(data=df, x='Touchscreen', y='Price', hue='Touchscreen', palette='plasma', legend=False)

In [None]:
sns.barplot(x='ram_gb', y='Price', hue='ram_gb', data=df, palette='plasma', legend=False)

In [None]:
sns.countplot(x ='os', data=df, hue='os', palette='plasma')

In [None]:
# Price by operating system. `hue` accompanies `palette` (as in the ram_gb plot) because
# seaborn deprecates `palette` without `hue`; legend=False hides the redundant legend.
sns.barplot(data=df, x='os', y='Price', hue='os', palette='magma', legend=False)

In [None]:
sns.histplot(df['Price'], kde=True)

In [None]:
def remove_outliers(df):
  """Drop rows whose numeric values lie outside the 1.5 * IQR fences.

  Every ``float64``/``int64`` column is handled in column order. The filtering
  is sequential: after one column has been filtered, the quartiles of the next
  column are computed on the rows that are still left. Rows whose value in a
  processed column is NaN fail both comparisons and are dropped as well.

  Args:
    df: Input DataFrame. It is not modified.

  Returns:
    A new, independent DataFrame containing only the rows that survived every
    column's filter. Returning a copy (rather than a boolean-indexed slice)
    lets callers assign to its columns without ``SettingWithCopyWarning``.
  """
  numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
  filtered = df
  for column in numeric_columns:
    Q1 = filtered[column].quantile(0.25)
    Q3 = filtered[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    keep = (filtered[column] >= lower_bound) & (filtered[column] <= upper_bound)
    filtered = filtered[keep]

  # Detach the result from the input frame so later column assignments are safe.
  return filtered.copy()

In [None]:
df=remove_outliers(df)

In [None]:
df.head()

In [None]:
# Keep the first run of digits of each value as an integer.
# r'...' avoids the invalid "\d" escape warning; astype(str) lets the cell be re-run
# after the column is already numeric; expand=False returns a Series.
df['hdd'] = df['hdd'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['hdd'].value_counts()

In [None]:
# Keep the first run of digits of each value as an integer.
# r'...' avoids the invalid "\d" escape warning; astype(str) lets the cell be re-run
# after the column is already numeric; expand=False returns a Series.
df['ram_gb'] = df['ram_gb'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['ram_gb'].value_counts()

In [None]:
# Keep the first run of digits of each value as an integer.
# r'...' avoids the invalid "\d" escape warning; astype(str) lets the cell be re-run
# after the column is already numeric; expand=False returns a Series.
df['ssd'] = df['ssd'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['ssd'].value_counts()

In [None]:
df.columns

In [None]:
# Keep the first run of digits of each value as an integer.
# r'...' avoids the invalid "\d" escape warning; astype(str) lets the cell be re-run
# after the column is already numeric; expand=False returns a Series.
df['os_bit'] = df['os_bit'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['os_bit'].value_counts()

In [None]:
# Keep the first run of digits of each value as an integer.
# r'...' avoids the invalid "\d" escape warning; astype(str) lets the cell be re-run
# after the column is already numeric; expand=False returns a Series.
df['graphic_card_gb'] = df['graphic_card_gb'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')


In [None]:
df['graphic_card_gb'].value_counts()

In [None]:
# Keep the first run of digits of each value as an integer.
# r'...' avoids the invalid "\d" escape warning; astype(str) lets the cell be re-run
# after the column is already numeric; expand=False returns a Series.
df['rating'] = df['rating'].astype(str).str.extract(r'(\d+)', expand=False).astype('int')

In [None]:
df['rating'].value_counts()

In [None]:
df['rating'].value_counts().plot(kind='bar')

In [None]:
df['processor_gnrtn'].unique()

In [None]:
df['processor_gnrtn'] = df['processor_gnrtn'].str.replace('th', '', regex=False)

In [None]:
df['processor_gnrtn'].dtype

In [None]:
df['processor_gnrtn'].value_counts()

In [None]:
df.head()

In [None]:
df.corr(numeric_only=True)

In [None]:
sns.heatmap(df.corr(numeric_only=True), annot=True)

# Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
#First do X and y  split
X=df.drop('Price', axis=1)
y=df['Price'].copy()

In [None]:
#Second make train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#split numeric and categorical values
num_feature = X_train.select_dtypes(include=[np.number]).columns
cat_feature = X_train.select_dtypes(exclude=[np.number]).columns

In [None]:
# Preprocessing + linear regression pipeline.

# Numeric columns: fill missing values with the median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode (categories unseen during fit are ignored at transform time).
# `fill_value` is intentionally not passed: SimpleImputer only uses it when
# strategy='constant', so alongside 'most_frequent' it had no effect.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each group of columns through its own preprocessing pipeline.
transformer = ColumnTransformer([
    ('num', num_pipeline, num_feature),
    ('cat', cat_pipeline, cat_feature)
])

estimator=LinearRegression()

full_pipeline = Pipeline([
    ('preprocessing', transformer),
    ('estimator', estimator)
])

In [None]:
full_pipeline.fit(X_train, y_train)

In [None]:
full_pipeline.score(X_train, y_train), full_pipeline.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(full_pipeline, X_train, y_train, cv=5)

# Random Forest Regressor

In [None]:
# Random forest with hand-picked hyper-parameters behind the shared preprocessing step.
# The step name 'estimator_1' is what the grid-search parameter keys refer to.
estimator_1 = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=42,
)

full_pipeline_1 = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator_1', estimator_1),
])

full_pipeline_1.fit(X_train, y_train)

In [None]:
full_pipeline_1.score(X_train, y_train), full_pipeline_1.score(X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest step. Keys use the
# "<step name>__<parameter>" form, so they must match the 'estimator_1' step name.
params={
    'estimator_1__n_estimators': [50,100,200],
    'estimator_1__max_depth': [ 10, 20, 30],
    'estimator_1__min_samples_split': [2, 5, 10],
    'estimator_1__min_samples_leaf': [1, 2, 4]
}

# 3 * 3 * 3 * 3 = 81 combinations x 5 folds = 405 fits: run them on all available
# cores. The forest's random_state is fixed, so the search result is unchanged.
grid_search = GridSearchCV(full_pipeline_1, params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
# Evaluate the tuned model. GridSearchCV fits clones of the pipeline, so
# full_pipeline_1 still carries the hand-picked hyper-parameters; the winner,
# refit on the training set, is exposed as best_estimator_.
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
y_pred[:10]

In [None]:
y_test[:10]

In [None]:
r2_score(y_test, y_pred)

In [None]:
mse = mean_squared_error(y_test, y_pred)
mse

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
mae = mean_absolute_error(y_test, y_pred)
mae

In [None]:
sns.kdeplot(y_test, color='red', label='Actual')
sns.kdeplot(y_pred, color='green', label='Predicted')
plt.legend()
plt.show()


# Decision Tree Regressor

In [None]:
# Single decision tree with hand-picked limits behind the shared preprocessing step.
estimator_2 = DecisionTreeRegressor(
    max_depth=15,
    min_samples_split=3,
    min_samples_leaf=5,
    random_state=42,
)

full_pipeline_2 = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator_2', estimator_2),
])

full_pipeline_2.fit(X_train, y_train)

In [None]:
full_pipeline_2.score(X_train, y_train), full_pipeline_2.score(X_test, y_test)


In [None]:
y_prediction=full_pipeline_2.predict(X_test)

In [None]:
mae = mean_absolute_error(y_test, y_prediction)
mae

In [None]:
sns.kdeplot(y_test, color='orange', label='Actual')
sns.kdeplot(y_prediction, color='blue', label='Predicted')
plt.legend()
plt.show()


# Ridge and Lasso Models

In [None]:
from sklearn.linear_model import Ridge, Lasso
# Ridge regression (alpha = 0.005) behind the shared preprocessing step.
estimator_3 = Ridge(alpha=0.005)

full_pipeline_3 = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator_3', estimator_3),
])

full_pipeline_3.fit(X_train, y_train)

In [None]:
full_pipeline_3.score(X_train, y_train), full_pipeline_3.score(X_test, y_test)

In [None]:
# Lasso regression (alpha = 1) behind the shared preprocessing step.
estimator_4 = Lasso(alpha=1)

full_pipeline_4 = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator_4', estimator_4),
])

full_pipeline_4.fit(X_train, y_train)

In [None]:
full_pipeline_4.score(X_train, y_train), full_pipeline_4.score(X_test, y_test)

In [None]:
y_predd=full_pipeline_4.predict(X_test)

In [None]:
r2_score(y_test, y_predd)

In [None]:
sns.kdeplot(y_test, color='purple', label='Actual')
sns.kdeplot(y_predd, color='black', label='Predicted')
plt.legend()
plt.show()

# GradientBoostingRegressor Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with explicitly spelled-out settings behind the shared preprocessing step.
estimator_5 = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

full_pipeline_5 = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator_5', estimator_5),
])

full_pipeline_5.fit(X_train, y_train)

In [None]:
y_pre = full_pipeline_5.predict(X_test)
mse = mean_squared_error(y_test, y_pre)
print("Mean Squared Error:", mse)

In [None]:
mae = mean_absolute_error(y_test, y_pre)
mae

In [None]:
r2_score(y_test, y_pre)

In [None]:
full_pipeline_5.score(X_train, y_train), full_pipeline_5.score(X_test, y_test)

In [None]:
sns.kdeplot(y_test, color='purple', label='Actual')
sns.kdeplot(y_pre, color='black', label='Predicted')
plt.legend()
plt.show()
#