# Data Experiments and Final Data Analysis

In [1]:
# Jupyter kernel / language metadata written out as a plain dict literal.
# Nothing is assigned here: evaluating the cell only echoes the dict as its output.
{
 "kernelspec": {
  "display_name": "Python 3",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
   "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.8.0"
 }
}

{'kernelspec': {'display_name': 'Python 3',
  'language': 'python',
  'name': 'python3'},
 'language_info': {'codemirror_mode': {'name': 'ipython', 'version': 3},
  'file_extension': '.py',
  'mimetype': 'text/x-python',
  'name': 'python',
  'nbconvert_exporter': 'python',
  'pygments_lexer': 'ipython3',
  'version': '3.8.0'}}

In [2]:
import os
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# Connect to the database and load the dataset.
# The base `phones` table is inner-joined with the screen, camera, feature and
# storage tables on `phone_id`, and the result is read into one DataFrame.
query = """
SELECT 
    p.phone_id,
    p.battery_power,
    p.clock_speed,
    p.m_dep,
    p.mobile_wt,
    p.n_cores,
    p.ram,
    p.talk_time,
    p.price_range,
    s.px_height,
    s.px_width,
    s.sc_h,
    s.sc_w,
    c.fc as front_camera,
    c.pc as primary_camera,
    f.blue,
    f.dual_sim,
    f.four_g,
    f.touch_screen,
    f.wifi,
    st.int_memory
FROM phones p
JOIN screen_specs s ON p.phone_id = s.phone_id
JOIN camera_specs c ON p.phone_id = c.phone_id
JOIN phone_features f ON p.phone_id = f.phone_id
JOIN storage_specs st ON p.phone_id = st.phone_id
"""
conn = sqlite3.connect('mobile_phones.db')
try:
    df_db = pd.read_sql_query(query, conn)
finally:
    # Always release the SQLite handle, including when the query raises
    # (e.g. a missing table), so the database file is not left open.
    conn.close()

In [3]:
# Preprocessing pipeline
# Only the columns named in these two lists are handed to a transformer; the
# ColumnTransformer below uses its default remainder='drop', so every other
# column of X is discarded before the classifier sees the data.
numerical_features = [
    'battery_power', 'clock_speed', 'm_dep', 'mobile_wt',
    'n_cores', 'ram', 'talk_time', 'sc_h', 'sc_w', 'int_memory'
]
categorical_features = ['blue', 'dual_sim', 'four_g', 'touch_screen', 'wifi']

# Numeric columns: standardise, rescale with MinMaxScaler, then apply log1p.
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('minmax', MinMaxScaler()),
    ('log', FunctionTransformer(np.log1p))
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', OneHotEncoder(), categorical_features)
])

# Define scoring metrics
# Raw metric functions have the signature (y_true, y_pred) and are not valid
# scorers for cross_validate / GridSearchCV, which call scorer(estimator, X, y).
# make_scorer wraps them and pins macro averaging, matching the 'f1_macro'
# scoring string used by the experiments.
scoring = {
    'f1_macro': make_scorer(f1_score, average='macro'),
    'precision_macro': make_scorer(precision_score, average='macro'),
    'recall_macro': make_scorer(recall_score, average='macro'),
    'accuracy': make_scorer(accuracy_score)
}

# Split dataset: the target and the row identifier are removed from the features.
X = df_db.drop(columns=['price_range', 'phone_id'])
y = df_db['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize experiment tracking (one dict per experiment is appended later)
results = []

In [4]:
# Upgrades xgboost in the kernel's environment; no version is pinned here.
# The printed note warns that a kernel restart may be needed before the
# upgraded package is the one actually imported.
%pip install --upgrade xgboost

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Experiment 1: Logistic Regression with preprocessing and hyperparameter tuning
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=2000, random_state=42))
])

# Grid over regularisation strength and solver (3 x 2 = 6 candidates).
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['liblinear', 'lbfgs']
}

# return_train_score=True keeps the per-candidate training scores so the
# train/validation gap of the winning candidate can be measured below.
grid_search = GridSearchCV(
    pipeline, param_grid, cv=3, scoring='f1_macro', return_train_score=True
)
grid_search.fit(X_train, y_train)

# best_estimator_ is refit on the whole training split by GridSearchCV.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Held-out test metrics
f1 = f1_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Detect overfitting with the same rule Experiments 4 and 5 use
# (mean train F1 exceeds mean validation F1 by more than 0.1), instead of
# recording a hard-coded False.
best_row = grid_search.best_index_
train_f1 = grid_search.cv_results_['mean_train_score'][best_row]
mean_f1 = grid_search.cv_results_['mean_test_score'][best_row]
overfitting = bool(train_f1 - mean_f1 > 0.1)

results.append({'Experiment': 'Experiment 1', 'Model': 'Logistic Regression', 'F1': f1, 'Accuracy': accuracy, 'Overfitting': overfitting})

In [6]:
from IPython.display import display

# Rebuild the summary table from every experiment logged so far.
results_df = pd.DataFrame(results)

# Echo the most recent test accuracy, then render the table.
print(accuracy)
display(results_df)

0.8250291618063657


Unnamed: 0,Experiment,Model,F1,Accuracy,Overfitting
0,Experiment 1,Logistic Regression,0.824288,0.825029,False


In [7]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Silence these three warning categories only; any other warning still surfaces.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=2000, random_state=42),
    'Ridge Classifier': RidgeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='mlogloss')
}

# Compare models: each one is wrapped in the shared preprocessing pipeline,
# cross-validated on the training split, then refit and scored on the test split.
for name, model in models.items():
    try:
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

        # Perform cross-validation (train scores are kept for the overfitting check)
        cv_results = cross_validate(
            pipeline, X_train, y_train, cv=10, scoring='f1_macro', return_train_score=True
        )
        train_f1 = np.mean(cv_results['train_score'])
        mean_f1 = np.mean(cv_results['test_score'])

        # Same overfitting rule as Experiments 4 and 5. Without this key the
        # row would hold NaN in the results table and be silently excluded by
        # the later `Overfitting == False` filter.
        overfitting = bool(train_f1 - mean_f1 > 0.1)

        # Fit and predict
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Calculate metrics
        f1 = f1_score(y_test, y_pred, average='macro')
        accuracy = accuracy_score(y_test, y_pred)

        # Append results
        results.append({
            'Experiment': 'Experiment 2',
            'Model': name,
            'F1': f1,
            'Accuracy': accuracy,
            'Overfitting': overfitting,
        })
    except Exception as exc:
        # Keep going with the remaining models, but report why this one failed.
        print(f"Failed to run {name}: {exc!r}")

KeyboardInterrupt: 

In [15]:

# Experiment 3: Feature engineering
# The new columns are written onto X_train / X_test in place; later cells read
# these same frames, so the in-place assignment is kept as is.
X_train['total_pixels'] = X_train['px_height'] * X_train['px_width']
X_test['total_pixels'] = X_test['px_height'] * X_test['px_width']

X_train['performance'] = X_train['battery_power'] * X_train['ram']
X_test['performance'] = X_test['battery_power'] * X_test['ram']

# The shared `preprocessor` only selects `numerical_features` and
# `categorical_features` and drops everything else (remainder='drop' is the
# ColumnTransformer default), so the engineered columns never reached the model.
# This experiment therefore gets its own transformer that also lists them.
engineered_features = ['total_pixels', 'performance']
preprocessor_fe = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features + engineered_features),
    ('cat', OneHotEncoder(), categorical_features)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor_fe),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Cross-validate on the training split so the overfitting flag is measured
# with the same rule as Experiments 4 and 5 rather than hard-coded.
cv_results = cross_validate(
    pipeline, X_train, y_train, cv=10, scoring='f1_macro', return_train_score=True
)
mean_f1 = np.mean(cv_results['test_score'])
train_f1 = np.mean(cv_results['train_score'])
overfitting = bool(train_f1 - mean_f1 > 0.1)

# Fit on the full training split and evaluate on the held-out test split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

results.append({'Experiment': 'Experiment 3', 'Model': 'Random Forest', 'F1': f1, 'Accuracy': accuracy, 'Overfitting': overfitting})

In [None]:
from IPython.display import display

# Rebuild the summary table from every experiment logged so far.
results_df = pd.DataFrame(results)

# Echo the most recent test accuracy, then render the table.
print(accuracy)
display(results_df)

In [17]:
# Experiment 4: Feature Selection

# Preprocessing, variance-based selection and the classifier are chained in ONE
# pipeline. cross_validate clones and refits the whole chain on each training
# fold, so scaler statistics and the selected-feature mask are never computed
# from the fold being validated (fitting them on all of X_train first leaks
# validation data into the CV scores).
# NOTE(review): the numeric columns are min-max scaled and then log1p-transformed
# before selection, which confines them to a narrow range; confirm via
# `selected_features` that threshold=0.1 does not discard all of them.
variance_thresholder = VarianceThreshold(threshold=0.1)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', variance_thresholder),
    ('classifier', LogisticRegression(max_iter=2000, random_state=42))
])

# Perform cross-validation on the raw training split
cv_results = cross_validate(
    pipeline, X_train, y_train,
    cv=10, scoring='f1_macro',
    return_train_score=True
)

# Aggregate results
mean_f1 = np.mean(cv_results['test_score'])
std_f1 = np.std(cv_results['test_score'])
train_f1 = np.mean(cv_results['train_score'])

# Fit on the full training split and evaluate on the held-out test split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

# Feature names after preprocessing: numeric names as-is, then one
# "<feature>_<category>" name per one-hot column, in transformer order.
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_selector = pipeline.named_steps['selector']
feature_names = (
    numerical_features +
    [f"{feat}_{val}" for feat, vals in
     zip(categorical_features,
         fitted_preprocessor.named_transformers_['cat'].categories_)
     for val in vals]
)

# Names of the columns that survived the variance threshold
selected_features = pd.Index(feature_names)[fitted_selector.get_support()]

# Selected-feature matrices, kept available for inspection
X_train_var = fitted_selector.transform(fitted_preprocessor.transform(X_train))
X_test_var = fitted_selector.transform(fitted_preprocessor.transform(X_test))

# Detect overfitting: mean train F1 exceeds mean validation F1 by more than 0.1
overfitting = train_f1 - mean_f1 > 0.1

# Append results for comparison
results.append({
    'Experiment': 'Experiment 4',
    'Model': 'Logistic Regression with Feature Selection',
    'F1': f1,
    'Accuracy': accuracy,
    'Overfitting': overfitting
})

In [None]:
from IPython.display import display

# Rebuild the summary table from every experiment logged so far.
results_df = pd.DataFrame(results)

# Echo the most recent test accuracy, then render the table.
print(accuracy)
display(results_df)

In [None]:
# Experiment 5: PCA
# PCA is variance-driven, so it is applied AFTER the shared preprocessor instead
# of on the raw frame: unscaled columns with large magnitudes would otherwise
# dominate the components, and the raw X_train may also carry the columns added
# by Experiment 3 depending on which cells were run. Chaining the steps in one
# pipeline also means cross_validate refits preprocessing and PCA per fold, and
# it matches the preprocessor -> PCA layout of the saved final pipeline.
pca = PCA(n_components=5)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', pca),
    ('classifier', LogisticRegression(max_iter=2000, random_state=42))
])

cv_results = cross_validate(pipeline, X_train, y_train, cv=10, scoring='f1_macro', return_train_score=True)

mean_f1 = np.mean(cv_results['test_score'])
std_f1 = np.std(cv_results['test_score'])
train_f1 = np.mean(cv_results['train_score'])

# Fit on the full training split and evaluate on the held-out test split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Projected matrices, kept available for inspection
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_pca = pipeline.named_steps['pca']
X_train_pca = fitted_pca.transform(fitted_preprocessor.transform(X_train))
X_test_pca = fitted_pca.transform(fitted_preprocessor.transform(X_test))

f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

overfitting = train_f1 - mean_f1 > 0.1  # Check for overfitting

# Append results for comparison
results.append({'Experiment': 'Experiment 5', 'Model': 'Logistic Regression', 'F1': f1, 'Accuracy': accuracy, 'Overfitting': overfitting})

# Save results and compare
results_df = pd.DataFrame(results)
results_df.to_csv('experiment_results.csv', index=False)

# Plot F1-scores for comparison.
# Experiment 2 logs several rows under the same 'Experiment' value; using that
# column alone as the category would stack those bars on one position, so each
# bar is labelled with experiment and model.
bar_labels = results_df['Experiment'] + ': ' + results_df['Model']
plt.figure(figsize=(10, 6))
plt.barh(bar_labels, results_df['F1'], color='skyblue')
plt.xlabel('F1 Score')
plt.ylabel('Experiment')
plt.title('Comparison of Experiments by F1 Score')
plt.tight_layout()
plt.show()

In [None]:
# Find the best model
# 'IsBest' marks eligibility: rows whose Overfitting flag is exactly False.
# Rows with a True or missing (NaN) flag compare unequal to False and are
# therefore not eligible.
results_df['IsBest'] = results_df['Overfitting'].eq(False)
eligible = results_df.loc[results_df['IsBest']]

if eligible.empty:
    # idxmax() raises on an empty selection; report the situation instead.
    best_experiment = None
    print("Best Experiment: none (no experiment is flagged as not overfitting)")
else:
    best_idx = eligible['F1'].idxmax()  # Index label of the highest eligible F1 score
    best_experiment = results_df.loc[best_idx]  # Full row for that label
    print("Best Experiment:", best_experiment)

In [21]:
# import numpy as np
# import pandas as pd
# import sqlite3
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.decomposition import PCA
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.feature_selection import VarianceThreshold
# from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
# import matplotlib.pyplot as plt

# # Connect to the database and load the dataset
# conn = sqlite3.connect('mobile_phones.db')
# query = """
# SELECT 
#     p.phone_id,
#     p.battery_power,
#     p.clock_speed,
#     p.m_dep,
#     p.mobile_wt,
#     p.n_cores,
#     p.ram,
#     p.talk_time,
#     p.price_range,
#     s.px_height,
#     s.px_width,
#     s.sc_h,
#     s.sc_w,
#     c.fc as front_camera,
#     c.pc as primary_camera,
#     f.blue,
#     f.dual_sim,
#     f.four_g,
#     f.touch_screen,
#     f.wifi,
#     st.int_memory
# FROM phones p
# JOIN screen_specs s ON p.phone_id = s.phone_id
# JOIN camera_specs c ON p.phone_id = c.phone_id
# JOIN phone_features f ON p.phone_id = f.phone_id
# JOIN storage_specs st ON p.phone_id = st.phone_id
# """
# df_db = pd.read_sql_query(query, conn)
# conn.close()

# # Preprocessing pipeline
# numerical_features = [
#     'battery_power', 'clock_speed', 'm_dep', 'mobile_wt',
#     'n_cores', 'ram', 'talk_time', 'sc_h', 'sc_w', 'int_memory'
# ]
# categorical_features = ['blue', 'dual_sim', 'four_g', 'touch_screen', 'wifi']

# numerical_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('minmax', MinMaxScaler()),
#     ('log', FunctionTransformer(np.log1p))
# ])

# preprocessor = ColumnTransformer([
#     ('num', numerical_pipeline, numerical_features),
#     ('cat', OneHotEncoder(), categorical_features)
# ])

# # Define scoring metrics
# scoring = {
#     'f1_macro': make_scorer(f1_score, average='macro'),
#     'precision_macro': make_scorer(precision_score, average='macro'),
#     'recall_macro': make_scorer(recall_score, average='macro'),
#     'accuracy': make_scorer(accuracy_score)
# }

# # Split dataset
# X = df_db.drop(columns=['price_range', 'phone_id'])
# y = df_db['price_range']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize experiment tracking
# results = []

# # Experiment 6: Advanced Ensemble Model (Gradient Boosting)
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', GradientBoostingClassifier(random_state=42))
# ])

# param_grid = {
#     'classifier__n_estimators': [100, 200],
#     'classifier__learning_rate': [0.01, 0.1, 0.2],
#     'classifier__max_depth': [3, 5, 7]
# }

# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_macro')
# grid_search.fit(X_train, y_train)

# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

# f1 = f1_score(y_test, y_pred, average='macro')
# precision = precision_score(y_test, y_pred, average='macro')
# recall = recall_score(y_test, y_pred, average='macro')
# accuracy = accuracy_score(y_test, y_pred)
# conf_matrix = confusion_matrix(y_test, y_pred)

# results.append({'Experiment': 'Experiment 6', 'Model': 'Gradient Boosting', 'F1': f1, 'Accuracy': accuracy, 'Overfitting': False})

# # Experiment 7: Combining Feature Selection and PCA
# variance_thresholder = VarianceThreshold(threshold=0.1)
# X_train_var = variance_thresholder.fit_transform(X_train)
# X_test_var = variance_thresholder.transform(X_test)

# pca = PCA(n_components=5)
# X_train_pca = pca.fit_transform(X_train_var)
# X_test_pca = pca.transform(X_test_var)

# pipeline = Pipeline([
#     ('classifier', RandomForestClassifier(random_state=42))
# ])

# pipeline.fit(X_train_pca, y_train)
# y_pred = pipeline.predict(X_test_pca)

# f1 = f1_score(y_test, y_pred, average='macro')
# accuracy = accuracy_score(y_test, y_pred)

# results.append({'Experiment': 'Experiment 7', 'Model': 'Random Forest with PCA', 'F1': f1, 'Accuracy': accuracy, 'Overfitting': False})

# # Save results and compare
# results_df = pd.DataFrame(results)
# results_df.to_csv('experiment_results.csv', index=False)

# # Plot F1-scores for comparison
# plt.figure(figsize=(10, 6))
# plt.barh(results_df['Experiment'], results_df['F1'], color='skyblue')
# plt.xlabel('F1 Score')
# plt.ylabel('Experiment')
# plt.title('Comparison of Experiments by F1 Score')
# plt.show()

# # Find the best model
# results_df['IsBest'] = results_df['Overfitting'] == False
# best_experiment = results_df[results_df['IsBest']].loc[results_df['F1'].idxmax()]
# print("Best Experiment:", best_experiment)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Sample DataFrame to illustrate functionality
# NOTE: these values are hard-coded and this assignment replaces whatever
# `results_df` held before the cell ran.
data = {
    "Experiment": [
        "Experiment 1", "Experiment 2: XGBoost", "Experiment 2: Random Forest",
        "Experiment 2: Ridge Classifier", "Experiment 3", "Experiment 4",
        "Experiment 5", "Experiment 6", "Experiment 7"
    ],
    "F1": [0.791, 0.909, 0.802, 0.582, 0.874, 0.762, 0.850, 0.887, 0.909],
    "Accuracy": [0.798, 0.908, 0.802, 0.580, 0.875, 0.760, 0.852, 0.884, 0.907],
    "Overfitting": [False, True, False, False, False, False, False, False, False]
}
results_df = pd.DataFrame(data)

# The best model is chosen among experiments NOT flagged as overfitting.
# A plain idxmax over every row returns the first of the tied 0.909 scores,
# which is the overfitting XGBoost run.
eligible = results_df[~results_df['Overfitting']]
best_experiment = eligible.loc[eligible['F1'].idxmax()]

# Create the plot for F1 scores
plt.figure(figsize=(12, 6))
plt.bar(results_df['Experiment'], results_df['F1'], color='skyblue', alpha=0.8, label='F1 Score')
plt.axhline(y=best_experiment['F1'], color='red', linestyle='--', linewidth=1, label='Best F1 Score')

# Annotate the best model; the x position is the bar's ordinal position on the
# categorical axis.
plt.text(
    x=results_df['Experiment'].tolist().index(best_experiment['Experiment']),
    y=best_experiment['F1'] + 0.01,
    s=f"Best: {best_experiment['Experiment']} ({best_experiment['F1']:.3f})",
    color='red', fontsize=10, ha='center'
)

# Add plot details
plt.title('F1-Score Comparison Across Experiments', fontsize=14)
plt.xlabel('Experiments', fontsize=12)
plt.ylabel('F1 Score', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()

Although XGBoost achieved the best F1 score, it showed signs of overfitting, so I went with Random Forest instead.

In [None]:
from sklearn.pipeline import Pipeline
import joblib

try:
    # Combine preprocessing, PCA, and classifier into one pipeline so the saved
    # artifact can be applied directly to raw feature columns.
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Preprocessing steps
        ('pca', PCA(n_components=5)),    # PCA step
        ('classifier', RandomForestClassifier(random_state=42))  # Classifier
    ])

    # Fit the full pipeline on the training split
    full_pipeline.fit(X_train, y_train)

    # Save the full pipeline
    model_filename = "random_forest_pca_full_pipeline.joblib"
    joblib.dump(full_pipeline, model_filename)
    print(f"Full pipeline saved as {model_filename}")

except Exception as exc:
    # Stay best-effort, but say what went wrong: a silent failure here would
    # leave a missing or stale model file for the loading cell.
    print(f"Failed to run: {exc!r}")

In [None]:
import joblib
import pandas as pd

try:
    # Load the full pipeline.
    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files that come from a trusted source.
    model_filename = "random_forest_pca_full_pipeline.joblib"
    full_pipeline = joblib.load(model_filename)
    print(f"Full pipeline loaded from {model_filename}")

    # Load test cases
    test_samples_file = "test_samples.json"
    test_cases = pd.read_json(test_samples_file)
    print("Test cases loaded:")
    print(test_cases)

    # Use the full pipeline to predict
    predictions = full_pipeline.predict(test_cases)

    # Display results (predictions are added as a new column on the same frame)
    test_cases['Predicted Price Range'] = predictions
    print("Test case predictions:")
    print(test_cases)

    # Save results for further inspection
    test_cases.to_csv("test_case_predictions.csv", index=False)
    print("Predictions saved to test_case_predictions.csv")

except Exception as exc:
    # Stay best-effort, but surface the reason (missing file, missing columns, ...)
    print(f"Failed to run: {exc!r}")

In [None]:
from IPython.display import display

# Rebuild the summary table from every experiment logged so far.
results_df = pd.DataFrame(results)

# Echo the most recent test accuracy, then render the table.
print(accuracy)
display(results_df)