# Cannabis Strain Analysis

In [13]:
import os
import collections
import itertools

from fastfm2 import als

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics, linear_model
from scipy import sparse


## Load Data

In [2]:
# The data file is looked up in the kernel's current working directory.
DATA_DIR = os.getcwd()
# Full path of the strain CSV that CannabisStrainAnalysis reads.
CSV_PATH = os.path.join(DATA_DIR, "cannabis.csv")


In [3]:
class CannabisStrainAnalysis:
    """Load a cannabis strain CSV and encode every strain as a numeric vector.

    The single-valued column ``Type`` is one-hot encoded. The list-valued
    columns ``Effects`` and ``Flavors`` are multi-hot encoded, where every
    listed category receives the weight ``1 / len(list)``.
    """

    def __init__(self, csv_path):
        """Read ``csv_path``, clean the rows and collect the category vocabulary.

        Args:
            csv_path: Path of the CSV file passed to ``pandas.read_csv``.
        """
        # Columns holding exactly one category per strain.
        self.categorical_single_columns = ["Type"]
        # Columns holding a comma-separated list of categories per strain.
        self.categorical_list_columns = ["Effects", "Flavors"]

        self.csv_path = csv_path
        df = pd.read_csv(self.csv_path)
        # Cleaned frame; the list columns hold Python lists of lower-case strings.
        self.df = self.clean_dataframe(df)
        # Maps each categorical column to its sorted list of known categories.
        self.categorical_column_to_unique_categories_dict = self.get_categorical_column_to_unique_categories_dict()

    @staticmethod
    def _split_categories(raw):
        """Split a comma-separated string into lower-case, stripped, non-empty tokens."""
        tokens = (token.strip() for token in raw.lower().split(","))
        return [token for token in tokens if token]

    def clean_dataframe(self, df):
        """Return a cleaned copy of ``df``; the frame passed in is left untouched.

        Steps:
            1. Rename the ``Flavor`` column to ``Flavors``.
            2. Drop rows that have a missing value in any column.
            3. Drop rows whose ``Rating``, ``Type``, ``Effects`` or ``Flavors``
               equals ``0.0`` or the string ``"None"``.
            4. Turn each list column from a comma-separated string into a list
               of lower-case category names.
            5. Reset the index to ``0..n-1``.

        Args:
            df: Raw frame as read from the CSV.

        Returns:
            A new ``pandas.DataFrame`` with the cleaned rows.
        """
        df = df.rename(columns={"Flavor": "Flavors"})  # fix name of Flavor column
        df = df.dropna()

        remove_null_cols = [
            "Rating",
            "Type",
            "Effects",
            "Flavors"
        ]
        masks = [(df[col] == 0.0) | (df[col] == "None") | (df[col].isnull()) for col in remove_null_cols]
        mask = ~np.logical_or.reduce(masks)
        # Explicit copy: the column assignments below must not write into a
        # slice of the unfiltered frame (avoids SettingWithCopyWarning).
        df = df[mask].copy()

        for col in self.categorical_list_columns:
            # Tokens are stripped and empty ones dropped, so "Happy, Relaxed"
            # and "Happy,Relaxed" produce the same categories.
            df[col] = [self._split_categories(s) for s in df[col]]

        return df.reset_index(drop=True)

    def get_categorical_column_to_unique_categories_dict(self):
        """Collect the sorted unique categories of every categorical column.

        Returns:
            Dict mapping column name to the sorted list of categories that
            occur in ``self.df`` for that column.
        """
        categorical_column_to_unique_categories_dict = {}

        for col in self.categorical_single_columns:
            categorical_column_to_unique_categories_dict[col] = sorted(set(self.df[col]))

        for col in self.categorical_list_columns:
            categorical_column_to_unique_categories_dict[col] = sorted(
                {category for categories_list in self.df[col] for category in categories_list}
            )

        return categorical_column_to_unique_categories_dict

    def transform_strain_to_x(self, strain):
        """Encode one strain as a 1-D float vector.

        Args:
            strain: Dict keyed by the categorical columns. Single columns map
                to one category, list columns map to a list of categories.

        Returns:
            ``numpy.ndarray`` holding, per column in order, one entry for each
            known category of that column.

        Raises:
            AssertionError: If a category is not part of the known vocabulary.
        """
        x = []

        for col in self.categorical_single_columns:
            unique_categories = self.categorical_column_to_unique_categories_dict[col]
            category_witnessed = strain[col]
            assert category_witnessed in set(unique_categories), (
                f"unknown category {category_witnessed!r} for column {col!r}"
            )
            x.extend(1.0 if category == category_witnessed else 0.0 for category in unique_categories)

        for col in self.categorical_list_columns:
            unique_categories = self.categorical_column_to_unique_categories_dict[col]
            categories_list = strain[col]
            witnessed = set(categories_list)
            unknown = witnessed - set(unique_categories)
            assert not unknown, f"unknown categories {sorted(unknown)!r} for column {col!r}"
            # Each listed category is weighted by 1 / (number of listed entries);
            # an empty list simply contributes zeros.
            weight = 1.0 / len(categories_list) if categories_list else 0.0
            x.extend(weight if category in witnessed else 0.0 for category in unique_categories)

        return np.array(x)

    def get_X(self):
        """Return the encoded feature vectors of all rows of ``self.df``, stacked row-wise."""
        X = np.array([self.transform_strain_to_x(strain) for strain in self.df.to_dict("records")])

        return X

    def get_y(self):
        """Return the ``Rating`` column of ``self.df`` as a NumPy array."""
        return np.array(self.df["Rating"])
    

In [4]:
# Constructing the object reads the CSV, cleans it and builds the category vocabulary.
analysis = CannabisStrainAnalysis(CSV_PATH)


Each strain $i$ is transformed into a feature vector and a target value:

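- X_i: a vector encoding the type, effects, and flavors of the strain. The type is one-hot encoded; each listed effect (or flavor) gets the weight 1/k, where k is the number of effects (or flavors) listed for the strain.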

- y_i: a float representing the rating of the strain. 

Here is an example of the transformation of a strain to its corresponding X_i vector:

In [5]:
# A hand-written strain: one type, two effects (0.5 each), three flavors (1/3 each).
example_strain = {
    "Type": "hybrid",
    "Effects": ["hungry", "energetic"],
    "Flavors": ["earthy", "citrus", "sweet"],
}
analysis.transform_strain_to_x(example_strain)

array([1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.5       , 0.        , 0.        , 0.        ,
       0.        , 0.5       , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.33333333, 0.        ,
       0.        , 0.33333333, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.33333333, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [6]:
# Shape of the encoded feature matrix returned by get_X().
analysis.get_X().shape

(2155, 67)

In [10]:
# Targets and CSR-compressed features, then a seeded 80/20 split.
y = analysis.get_y()
X = sparse.csr_matrix(analysis.get_X())
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Baseline Algorithm: Just Predict The Mean

In [27]:
# Constant predictor: every test strain gets the mean rating of the training set.
y_pred = np.full(len(y_test), np.mean(y_train))
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

0.11045012475708034

## Simple Algorithm: Linear Regression

In [11]:
# Ordinary least squares on the encoded features, scored on the held-out set.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

0.11530658014837322

## Complex Algorithm: Factorization Machines

In [23]:
# Factorization machine regressor from fastfm2's `als` module.
# The hyperparameters are fixed literals here, not tuned.
fm = als.FMRegression(
    n_iter=1000,
    rank=2,
    l2_reg_w=0.1,
    l2_reg_V=0.5,
)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

0.12333409659444766

## Observations

Both algorithms perform slightly worse than just predicting the mean rating (test MSE of about 0.115 for linear regression and 0.123 for factorization machines, versus 0.110 for the baseline). Given this, it is likely that the data set is not very informative and that there are no meaningful patterns to be found between the type, effects, and flavors of a cannabis strain and its corresponding rating.

In other words, it is unlikely that the rating of a given cannabis strain has much to do with its listed features. 