In [1]:
import pandas as pd

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy import stats

In [15]:

# Load the dataset
df = pd.read_csv("/content/final_assignment.csv")


In [16]:
df.isnull().sum()

Unnamed: 0,0
Customer ID,0
Purchase Date,0
Product Category,0
Product Price,0
Quantity,0
Total Purchase Amount,0
Payment Method,0
Customer Age,0
Returns,970
Customer Name,0


In [17]:

# STEP 2: Drop irrelevant columns
df.drop(['Customer ID', 'Purchase Date', 'Customer Name'], axis=1, inplace=True)

# STEP 3: Handle missing values
df['Returns'] = df['Returns'].fillna(0)
df = df.dropna()  # drop rows with missing categorical/numeric values


In [18]:
# STEP 4: Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# STEP 5: Z-score outlier removal on numeric features only
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
z_scores = np.abs(stats.zscore(df[numeric_cols]))
df = df[(z_scores < 3).all(axis=1)]

In [19]:
# STEP 6: Separate features and target
X = df.drop('Returns', axis=1)
y = df['Returns']

# Edge Case: Ensure X has at least 1 row left
if X.empty:
    raise ValueError("No data left after preprocessing. Check for overly strict filtering.")


In [20]:
# STEP 7: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# STEP 8: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [21]:
# STEP 9: Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [22]:
# STEP 10: Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5966666666666667
              precision    recall  f1-score   support

         0.0       0.60      1.00      0.75       895
         1.0       0.00      0.00      0.00       605

    accuracy                           0.60      1500
   macro avg       0.30      0.50      0.37      1500
weighted avg       0.36      0.60      0.45      1500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:

# STEP 1: Load dataset
df = pd.read_csv("/content/final_assignment.csv")

# STEP 2: Drop irrelevant columns
df.drop(['Customer ID', 'Purchase Date', 'Customer Name'], axis=1, inplace=True)

# STEP 3: Handle missing values
df['Returns'] = df['Returns'].fillna(0)
df = df.dropna()

# STEP 4: Encode categorical variables
df = pd.get_dummies(df, drop_first=True)


In [26]:
pip install imbalanced-learn



In [29]:
from imblearn.over_sampling import SMOTE
# STEP 5: Remove outliers using z-scores
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
z_scores = np.abs(stats.zscore(df[numeric_cols]))
df = df[(z_scores < 3).all(axis=1)]

# STEP 6: Split features and target
X = df.drop('Returns', axis=1)
y = df['Returns']

# STEP 7: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# STEP 8: Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)


In [31]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from scipy import stats
# STEP 9: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# STEP 10: Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_log_pred = log_model.predict(X_test)

print("\n=== Logistic Regression Results ===")
print("Accuracy:", accuracy_score(y_test, y_log_pred))
print(classification_report(y_test, y_log_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_log_pred))


=== Logistic Regression Results ===
Accuracy: 0.49887640449438203
              precision    recall  f1-score   support

         0.0       0.50      0.61      0.55       884
         1.0       0.50      0.39      0.44       896

    accuracy                           0.50      1780
   macro avg       0.50      0.50      0.49      1780
weighted avg       0.50      0.50      0.49      1780

Confusion Matrix:
 [[537 347]
 [545 351]]


In [33]:
from sklearn.ensemble import RandomForestClassifier
# STEP 11: Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_rf_pred = rf_model.predict(X_test)

print("\n=== Random Forest Results ===")
print("Accuracy:", accuracy_score(y_test, y_rf_pred))
print(classification_report(y_test, y_rf_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_rf_pred))


=== Random Forest Results ===
Accuracy: 0.6089887640449438
              precision    recall  f1-score   support

         0.0       0.60      0.62      0.61       884
         1.0       0.62      0.60      0.61       896

    accuracy                           0.61      1780
   macro avg       0.61      0.61      0.61      1780
weighted avg       0.61      0.61      0.61      1780

Confusion Matrix:
 [[550 334]
 [362 534]]


In [35]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree with pruning
dt_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=5,             # Limits depth of the tree
    min_samples_split=10,    # Requires at least 10 samples to split an internal node
    ccp_alpha=0.005          # Cost-complexity pruning (use GridSearchCV to tune)
)

dt_model.fit(X_train, y_train)
y_dt_pred = dt_model.predict(X_test)

print("\n=== Pruned Decision Tree Results ===")
print("Accuracy:", accuracy_score(y_test, y_dt_pred))
print(classification_report(y_test, y_dt_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_dt_pred))


=== Pruned Decision Tree Results ===
Accuracy: 0.4966292134831461
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.66       884
         1.0       0.00      0.00      0.00       896

    accuracy                           0.50      1780
   macro avg       0.25      0.50      0.33      1780
weighted avg       0.25      0.50      0.33      1780

Confusion Matrix:
 [[884   0]
 [896   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:

# STEP 1: Load dataset
df = pd.read_csv("/content/final_assignment.csv")
# STEP 2: Drop irrelevant columns
df.drop(['Customer ID', 'Purchase Date', 'Customer Name'], axis=1, inplace=True)


In [37]:
# STEP 3: Split into known (for training) and unknown (to predict later)
df_known = df[df['Returns'].notnull()]
df_unknown = df[df['Returns'].isnull()].drop('Returns', axis=1)

# STEP 4: Preprocess known data
# Encode categorical variables
df_known_encoded = pd.get_dummies(df_known, drop_first=True)

In [38]:
# Separate features and target
X = df_known_encoded.drop('Returns', axis=1)
y = df_known_encoded['Returns']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [39]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# STEP 5: Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


In [40]:
# Evaluate
y_pred = rf_model.predict(X_test)
print("\n=== Evaluation on Known Data ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



=== Evaluation on Known Data ===
Accuracy: 0.4747725392886683
              precision    recall  f1-score   support

         0.0       0.48      0.48      0.48       607
         1.0       0.47      0.47      0.47       602

    accuracy                           0.47      1209
   macro avg       0.47      0.47      0.47      1209
weighted avg       0.47      0.47      0.47      1209



In [41]:
# STEP 6: Predict missing returns
# Preprocess unknown rows similarly
df_unknown_encoded = pd.get_dummies(df_unknown, drop_first=True)

# Align columns with training data (in case of missing dummies)
df_unknown_encoded = df_unknown_encoded.reindex(columns=X.columns, fill_value=0)

# Scale
X_unknown_scaled = scaler.transform(df_unknown_encoded)

# Predict
predicted_returns = rf_model.predict(X_unknown_scaled)

# Output predicted returns
print("\n=== Predicted Returns for Previously Missing Rows ===")
print(predicted_returns)


=== Predicted Returns for Previously Missing Rows ===
[0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1.
 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1.
 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1.
 0. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1.
 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1.
 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0.
 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0.
 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0.