### Arahan Assignment
0. Gunakan dataset student score
1. Lakukan proses Exploratory data analysis
2. Lakukan feature engineering:
- Check Duplicated Data
- Check Missing Value Handling
- Outlier Analysis
3. Lakukan modelling machine learning regression: gunakan minimal 2 model (linear regression, decision tree regressor, atau random forest regressor)
4. Lakukan evaluasi model
5. Berikan kesimpulan model mana yang terbaik performanya

In [None]:
# Import libraries and resources
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the student-score CSV (path is relative to the working directory)
# and preview the first five rows as the cell's output.
csv_path = 'student_scores.csv'
dataset = pd.read_csv(csv_path)
dataset.head(n=5)

In [None]:
# Exploratory Data Analysis (EDA)
# Preview the first rows; the "\n" escape puts the table on its own line
# below the heading.
print("Dataset Overview:\n", dataset.head())
# DataFrame.info() prints its report itself, so it is called as a
# statement after the heading instead of being passed to print().
print("\nDataset Info:\n")
dataset.info()
# Count, mean, spread and quartiles for the numeric columns.
print("\nSummary Statistics:\n", dataset.describe())


In [None]:
# Visualizing Data
# Scatter plot of the 'Hours' column against the 'Scores' column on an
# 8x5-inch figure, with explicit axis labels and a title.
plt.figure(figsize=(8, 5))
sns.scatterplot(data=dataset, x='Hours', y='Scores')
plt.title('Hours vs Scores')
plt.xlabel('Hours Studied')
plt.ylabel('Scores')
plt.show()

In [6]:
# Feature Engineering
# Flag every row that repeats an earlier row, then count the flags.
duplicate_mask = dataset.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicated rows: {duplicates}")



Number of duplicated rows: 0


In [None]:
# Check for missing values
# Per-column count of null entries (isna is the same check as isnull).
missing_values = dataset.isna().sum()
print("\nMissing Values:\n", missing_values)

# Outlier Analysis
# Box plot of the dataset's columns drawn on the current axes; points
# beyond the whiskers are the candidates for outliers.
sns.boxplot(data=dataset, ax=plt.gca())
plt.show()

# Splitting Data into Train and Test
# X stays two-dimensional (a one-column DataFrame) while y is a Series.
X, y = dataset[['Hours']], dataset['Scores']

# Hold out 20% of the rows for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training
# Candidate regressors, keyed by the label used in the printed report.
# The tree-based models are seeded so repeated runs give the same fit.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# Maps model label -> {'MSE': ..., 'R2 Score': ...} measured on the test split.
results = {}

for name, model in models.items():
    # Fit on the training split, then predict the held-out rows.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2 Score': r2}
    # One report per model: heading, then both metrics to two decimals.
    print(
        f"\n{name} Model Performance:",
        f"MSE: {mse:.2f}",
        f"R2 Score: {r2:.2f}",
        sep="\n",
    )

# Comparing Model Performance
# Transpose so each row is a model and each column is a metric.
results_df = pd.DataFrame(results).T
print("\nModel Comparison:\n", results_df)

# Conclusion: report the best-performing model. All models are scored on
# the same test split, so the lowest MSE is also the highest R2 score.
# Skipped when no model was evaluated, to avoid a lookup error.
if not results_df.empty:
    best_model_name = results_df['MSE'].idxmin()
    best_mse = results_df.loc[best_model_name, 'MSE']
    best_r2 = results_df.loc[best_model_name, 'R2 Score']
    print(
        f"\nBest model: {best_model_name} "
        f"(MSE: {best_mse:.2f}, R2 Score: {best_r2:.2f})"
    )