In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Seed NumPy's global RNG so the synthetic dataset is identical on every run
np.random.seed(42)

# Number of synthetic rows; every column below is generated with this length
N_ROWS = 10000

# Build the sample dataset.
# NOTE: all random columns draw from the same seeded global RNG, so the order
# of the entries below determines the generated values -- do not reorder them.
# np.random.randint excludes its upper bound (Duration is 1..99, Age is 18..64).
data = {
    'ID': range(1, N_ROWS + 1),
    'Distributor': np.random.choice(['D1', 'D2', 'D3'], N_ROWS),
    'Product': np.random.choice(['P1', 'P2', 'P3'], N_ROWS),
    'Duration': np.random.randint(1, 100, N_ROWS),
    'Destination': np.random.choice(['City_A', 'City_B', 'City_C'], N_ROWS),
    'Sales': np.random.uniform(50, 500, N_ROWS),
    'Commission': np.random.uniform(5, 50, N_ROWS),
    'Gender': np.random.choice(['Male', 'Female'], N_ROWS),
    'Age': np.random.randint(18, 65, N_ROWS),
    'Target': np.random.choice([0, 1], N_ROWS)
}

df = pd.DataFrame(data)

# Display the first few rows of the dataset
print("Sample Dataset:")
print(df.head())

# Hold out 20% of the rows as the test set (fixed random_state for a repeatable split)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Persist both splits; index=False keeps the shuffled row index out of the CSVs
train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

# Display information about the train and test datasets.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\nTrain Dataset Info:")
train_data.info()

print("\nTest Dataset Info:")
test_data.info()

Sample Dataset:
   ID Distributor Product  Duration Destination       Sales  Commission  \
0   1          D3      P3        58      City_C  406.489205   38.527552   
1   2          D1      P2        42      City_B  251.740559   40.455429   
2   3          D3      P2        35      City_C  479.575342   34.742114   
3   4          D3      P1        38      City_B  116.526603   48.029763   
4   5          D1      P2        67      City_A  432.619246   43.890186   

   Gender  Age  Target  
0  Female   40       1  
1  Female   32       1  
2  Female   35       0  
3    Male   56       1  
4    Male   52       0  

Train Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 9254 to 7270
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           8000 non-null   int64  
 1   Distributor  8000 non-null   object 
 2   Product      8000 non-null   object 
 3   Duration     8000 non-null   int64  


In [7]:
# Everything this cell needs from scikit-learn beyond train_test_split is
# imported here so the cell does not depend on leftover kernel state
# (RandomForestClassifier and the metrics were previously used without an import).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the training data
train_data = pd.read_csv('train.csv')

# Display the first few rows of the training data
print("Training Data:")
print(train_data.head())

# Load the test data
test_data = pd.read_csv('test.csv')

# Display the first few rows of the test data
print("\nTest Data:")
print(test_data.head())

# Stack train and test (train rows first) so each categorical column is encoded
# with one shared label -> integer mapping across both sets
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Separate features (X) and target variable (y); ID is an identifier, not a feature
X_combined = combined_data.drop(['ID', 'Target'], axis=1)
y_combined = combined_data['Target']

# Initialize label encoder
label_encoder = LabelEncoder()

# Encode categorical columns as integer codes.
# The single encoder is re-fitted for every column, so after the loop it only
# remembers the mapping of the last column ('Gender').
for col in ['Distributor', 'Product', 'Destination', 'Gender']:
    X_combined[col] = label_encoder.fit_transform(X_combined[col])

# Undo the concatenation: the first len(train_data) rows are the training rows
n_train = len(train_data)
X_train_full = X_combined.iloc[:n_train]
X_test = X_combined.iloc[n_train:]

# Target values for the training rows only
y_train_full = y_combined.iloc[:n_train]

# Carve a 20% validation set out of the training rows
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

# Create a RandomForestClassifier model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = model.predict(X_val)

# Evaluate the model
print("\nClassification Report on Validation Set:")
print(classification_report(y_val, val_predictions))
print("Accuracy on Validation Set:", accuracy_score(y_val, val_predictions))

# Make predictions on the test set
test_predictions = model.predict(X_test)

# Create submission file: one row per test ID with its predicted Target.
# test_data keeps its default 0..n-1 index, matching the order of X_test.
submission_df = pd.DataFrame(
    {'ID': test_data['ID'], 'Target': test_predictions}
).set_index('ID')

# Save submission file
submission_df.to_csv('submission.csv')

# Display the submission file
print("\nSubmission File:")
print(submission_df.head())


Training Data:
     ID Distributor Product  Duration Destination       Sales  Commission  \
0  9255          D3      P2        94      City_B  344.584362   16.686910   
1  1562          D1      P1        54      City_A  151.111182   29.401495   
2  1671          D1      P1        65      City_B  450.717781   36.677645   
3  6088          D2      P3        66      City_A  191.998018   44.772121   
4  6670          D2      P1        58      City_A   72.931270   10.972500   

   Gender  Age  Target  
0    Male   23       0  
1    Male   38       1  
2    Male   44       0  
3  Female   41       0  
4    Male   49       1  

Test Data:
     ID Distributor Product  Duration Destination       Sales  Commission  \
0  6253          D2      P2        10      City_A  473.160522   15.868434   
1  4685          D1      P3        53      City_B  396.729007    9.409580   
2  1732          D3      P1        80      City_C  284.415466   32.976160   
3  4743          D2      P2        37      City_B  4