In [1]:
# Import libraries
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder


✅ Explanation:
These are Python libraries used for data science and machine learning:

    pandas → For handling data (tables, CSV files).

    numpy → For numerical operations.

    joblib → For saving/loading trained models.

    sklearn.model_selection.train_test_split → For splitting data into training and testing sets.

    LabelEncoder → Converts text labels into numbers.

    RandomForestClassifier → A machine learning algorithm used for classification.

    classification_report, confusion_matrix, accuracy_score → Tools to evaluate how well the model performs.

In [3]:
# Load dataset
# The CSV location can be overridden through the DRUG_DATA_CSV environment
# variable; when it is unset, the original local path below is used, so the
# notebook keeps working unchanged on the original machine.
file_path = os.environ.get(
    "DRUG_DATA_CSV",
    r"C:\Users\Iskcon Kainth\OneDrive\Desktop\Projects\capstone project Drug consumption\Drug Consumption (Test Dataset).csv",
)
df = pd.read_csv(file_path)

✅ Explanation:
This code loads the dataset (CSV file) that contains data related to drug consumption into a DataFrame named df.

In [4]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Alcohol,Amphet,Amyl,Benzos,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,2,25-34,M,Doctorate degree,UK,White,CL5,CL2,CL2,CL0,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
1,3,35-44,M,Professional certificate/ diploma,UK,White,CL6,CL0,CL0,CL0,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
2,4,18-24,F,Masters degree,UK,White,CL4,CL0,CL0,CL3,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
3,5,35-44,F,Doctorate degree,UK,White,CL4,CL1,CL1,CL0,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
4,6,65+,F,Left school at 18 years,Canada,White,CL2,CL0,CL0,CL0,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0


✅ Explanation:
This line shows the first 5 rows of the data so we can understand what kind of data we are dealing with — like names of columns and a few sample values.

In [5]:
# Data-quality checks.
# A notebook cell only displays its last expression, so the per-column
# missing-value counts are printed explicitly; otherwise they are discarded.
print("Missing values per column:")
print(df.isnull().sum())

# Number of fully duplicated rows (shown as the cell's output)
df.duplicated().sum()

0

In [6]:
# Report (rows, columns) of the loaded frame
print(f'Shape of the data set: {df.shape}')

Shape of the data set: (1884, 25)


✅ Explanation:
This tells us how many rows and columns are in the dataset. Here, (1884, 25) means 1884 records and 25 columns.

In [7]:
# Keep each respondent's country as a separate label array.
# NOTE(review): Labels, Data and Labels_keys are not referenced by any later
# cell visible in this notebook — confirm whether this cell is still needed.
Labels = df['Country']
Data = df.drop(['Gender', 'Country'], axis = 1)
Labels_keys = Labels.unique().tolist()
Labels = np.array(Labels)
# The values are countries, so the caption says so (it previously read
# "Eduaction labels").
print('Country labels: ' + str(Labels_keys))

Eduaction labels: ['UK', 'Canada', 'USA', 'Other', 'Australia', 'Republic of Ireland', 'New Zealand']


✅ Explanation:

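    Labels = df['Country'] → Stores each person's country as a separate label column. Note that the model trained later in this notebook predicts cannabis use (Cannabis_Binary), not country.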

    Data = df.drop(['Gender', 'Country'], axis=1) → Creates a copy of the data without the 'Gender' and 'Country' columns. This copy is not used by the model below, which is trained on df_encoded.

    Labels_keys shows all the unique countries in the dataset.

    Labels is converted into a NumPy array for machine learning use.

In [8]:
# Binary target derived from the Cannabis usage class:
#   0 = Non-user     (CL0, CL1, CL2)
#   1 = Regular user (CL3, CL4, CL5, CL6 — in fact any value outside the list below)
low_use_classes = ['CL0', 'CL1', 'CL2']
is_low_use = df['Cannabis'].isin(low_use_classes)
df['Cannabis_Binary'] = (~is_low_use).astype('int64')

✅ Explanation:

    The original Cannabis column has many levels (CL0 to CL6).

    We're simplifying this by creating a binary column:

        0 → Non-user (low consumption)

        1 → Regular user (high consumption)

In [9]:
# Modelling frame: everything except the row identifier and the raw Cannabis
# class (the latter is replaced by Cannabis_Binary)
df_model = df.drop(['ID', 'Cannabis'], axis=1)

✅ Explanation:
We remove the ID (not useful for prediction) and the original Cannabis column (we replaced it with Cannabis_Binary).

In [11]:
# Fit one LabelEncoder per column and keep it for encoding new data later.
#
# df_model is deliberately left untouched: overwriting its columns with the
# integer codes made this cell non-idempotent — running it a second time would
# refit every encoder on integers, and encoders[col].transform() would then
# fail on raw strings at prediction time. The numeric frame used for training
# is produced separately as df_encoded.
encoders = {}
for column in df_model.columns:
    encoders[column] = LabelEncoder().fit(df_model[column])

✅ Explanation:
Most columns have text (like education level, country, etc.).
We convert them into numbers using LabelEncoder, so that the machine learning model can understand them.

In [12]:
# Build the all-numeric frame: every column is label-encoded independently
df_encoded = df_model.apply(
    lambda column: LabelEncoder().fit_transform(column),
    axis=0,
)

✅ Explanation:
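This one-liner label-encodes every column of df_model and stores the result in df_encoded, the frame the model is trained on.
It fits fresh encoders instead of reusing the ones kept in `encoders`; the resulting codes are the same because LabelEncoder always numbers a column's values in sorted order.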

In [13]:
# Separate the target column from the predictor columns
target_column = 'Cannabis_Binary'
y = df_encoded[target_column]
X = df_encoded.drop(target_column, axis=1)

✅ Explanation:

    X contains all the input features (like age, education, etc.)

    y is the target — whether a person is a regular cannabis user or not

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

✅ Explanation:

    We divide the data: 80% for training, 20% for testing.

    random_state=42 ensures reproducible results.

In [15]:
# Random forest with default hyper-parameters and a fixed seed,
# fitted on the training split only
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

✅ Explanation:
We use the Random Forest classifier to train the model using the training data.

In [16]:
# Predicted class for every row of the held-out test split
y_pred = model.predict(X=X_test)

✅ Explanation:
The trained model is used to predict the cannabis use (binary) for the test data.

In [17]:
# Compare the test-split predictions with the true labels
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

✅ Explanation:
We calculate:

    Accuracy → How many predictions were correct.

    Classification Report → Gives precision, recall, F1-score.

    Confusion Matrix → Shows how many users/non-users were classified correctly or incorrectly.

In [18]:
# Show each metric under its heading, in the order accuracy / report / matrix
for heading, value in (
    ("Accuracy:", accuracy),
    ("\nClassification Report:\n", report),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(heading, value)

Accuracy: 0.870026525198939

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.83      0.87       191
           1       0.84      0.91      0.87       186

    accuracy                           0.87       377
   macro avg       0.87      0.87      0.87       377
weighted avg       0.87      0.87      0.87       377

Confusion Matrix:
 [[159  32]
 [ 17 169]]


✅ Explanation:
This prints:

    The overall accuracy of the model.

    Detailed classification report (precision, recall, F1-score).

    Confusion matrix (how well it distinguished between users and non-users).

In [19]:
# Persist the fitted classifier to disk (the written file names are the cell's output)
joblib.dump(value=model, filename="cannabis_rf_model.pkl")

['cannabis_rf_model.pkl']

✅ Explanation:
Saves the trained model into a file called cannabis_rf_model.pkl.
This allows you to reuse it later without retraining.

In [20]:
# Persist the per-column encoders so new raw data can be encoded the same way
joblib.dump(value=encoders, filename="cannabis_encoders.pkl")
print("\nModel and encoders saved successfully.")


Model and encoders saved successfully.


✅ Explanation:
Saves the label encoders used to convert text data into numbers.
You’ll need them again when predicting new data.

In [21]:
# Restore the persisted artefacts written by the two save cells
encoders = joblib.load("cannabis_encoders.pkl")
model = joblib.load("cannabis_rf_model.pkl")

# One raw (un-encoded) respondent; the keys are listed in the same order as
# the feature columns the model was trained on
raw_record = {
    'Age': '25-34',
    'Gender': 'M',
    'Education': 'Doctorate degree',
    'Country': 'UK',
    'Ethnicity': 'White',
    'Alcohol': 'CL5',
    'Amphet': 'CL2',
    'Amyl': 'CL2',
    'Benzos': 'CL0',
    'Caff': 'CL6',
    'Choc': 'CL5',
    'Coke': 'CL2',
    'Crack': 'CL0',
    'Ecstasy': 'CL4',
    'Heroin': 'CL0',
    'Ketamine': 'CL2',
    'Legalh': 'CL0',
    'LSD': 'CL2',
    'Meth': 'CL3',
    'Mushrooms': 'CL0',
    'Nicotine': 'CL4',
    'Semer': 'CL0',
    'VSA': 'CL0',
}
raw_input = pd.DataFrame([raw_record])

# Encode every column with the encoder stored under the same column name
new_data = pd.DataFrame(
    {name: encoders[name].transform(raw_input[name]) for name in raw_input.columns},
    index=raw_input.index,
)

# Classify the encoded row and show the single predicted label
prediction = model.predict(new_data)
print("Predicted class (0 = Non-user, 1 = Regular user):", prediction[0])

Predicted class (0 = Non-user, 1 = Regular user): 1


✅ Explanation:
This part simulates how you can use the model in real life:

    A new person’s data (age, gender, education, drug usage, etc.) is entered.

    It’s processed using the same encoders.

    The model predicts whether that person is a cannabis user (1) or non-user (0).

In [28]:
# 5-fold cross-validation over the full feature matrix, as a check on the
# single hold-out split above. cross_val_score fits a fresh clone of `model`
# for every fold, so the already-fitted estimator is not reused.
# (cross_val_score is imported in the import cell at the top of the notebook.)
scores = cross_val_score(model, X, y, cv=5)
print("CV Accuracy:", scores.mean())

CV Accuracy: 0.8545882950505108
