# Cannabis Project

### Introduction

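This notebook explores the OCPDB cannabis dataset: it loads and cleans the raw table, looks at the distributions of the moisture, cannabinoid, and terpene measurements, bins those measurements into ranges, and compares the average chemical profile of individual strains.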

In [None]:
import warnings

In [None]:
#importing required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw CSV export into a DataFrame and preview its first rows.
csv_path = 'datasets_254652_535029_OCPDB.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show every row that repeats an earlier row across all columns.
# An empty result means the dataset holds no duplicated rows.
is_repeat = df.duplicated()
df[is_repeat]

In [None]:
# Summary statistics of df (describe() reports count, mean, std, min,
# quartiles and max for the numeric columns by default).
df.describe()

The duplicate check returned no rows, so the dataset contains no duplicated rows.

In [None]:
# Data type of every column. Left as the cell's last expression so the
# notebook renders it as the cell output instead of printing it.
df.dtypes

In [None]:
# Collect the rows that contain at least one missing value.
# isnull().any(axis=1) builds the row mask with a vectorized reduction rather
# than calling Python's max() once per row through apply().
df_nulls = df[df.isnull().any(axis=1)]
df_nulls

Every row contains at least one null value.

In [None]:
# Number of distinct values in the 'Strain' column.
df['Strain'].nunique()

In [None]:
# Number of rows recorded for each strain (value_counts() lists the most
# frequent strain first).
df['Strain'].value_counts()

In [None]:
# Number of rows for each distinct value of the 'ReportType' column.
df['ReportType'].value_counts()

In [None]:
# Column names of df as loaded, before any columns are dropped.
df.columns

In [None]:
# Drop the columns that hold no values or only a single repeated value.
# Note: df is overwritten, so running this cell a second time raises a
# KeyError because the columns are already gone.
df = df.drop(columns=[
    'OCPID', 'DateRecorded', 'SampleID', 'ReportType', 'ChemicalLab',
    'Δ8-THC', 'THCV', 'CBDV', 'CBN',
    'GeneticLab', 'Sample', 'SampleURL', 'Organism', 'OrganismURL',
    'Project', 'ProjectURL', 'Study', 'StudyURL', 'Run', 'RunURL',
    'DatePublished', 'Spots', 'Bases', 'Size', 'Notes',
])

In [None]:
# Preview the first rows of df after the column drop.
df.head()

In [None]:
# Convert the 'H2O' column to numbers: strip the '%' sign from each entry and
# parse what is left; entries that cannot be parsed become NaN.
# astype(str) keeps the cell re-runnable: once the column is numeric the .str
# accessor would otherwise raise. regex=False makes '%' a literal character
# regardless of the pandas version's default.
h2o_text = df['H2O'].astype(str).str.replace('%', '', regex=False)
df['H2O'] = pd.to_numeric(h2o_text, errors='coerce')

In [None]:
# Column names that remain in df after the column drop.
df.columns

In [None]:
# Keep only the strain name, the moisture reading, and the cannabinoid and
# terpene measurements used in the rest of the notebook.
cannabinoid_columns = ['TotalTHC', 'THC', 'THCA', 'CBG']
terpene_columns = ['α-Pinene', 'Myrcene', 'β-Pinene', 'D-Limonene', 'Linalool',
                   'β-Caryophyllene', 'α-Humelene', 'α-Bisabolol']
newdf = df[['Strain', 'H2O'] + cannabinoid_columns + terpene_columns]

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# .copy() makes cleandf an independent frame, so the later cells that add
# binned columns to it do not write into a slice of another DataFrame
# (the source of pandas' SettingWithCopyWarning).
cleandf = newdf.dropna().copy()

In [None]:
# Draw the distribution of each measurement side by side and save the figure.
# Note: this filter is process-wide, so every warning raised by any later
# cell is hidden as well.
warnings.filterwarnings("ignore")
sns.set()

# One panel per column, left to right.
# NOTE(review): 'β-Caryophyllene' is selected into newdf but has no panel
# here — confirm whether leaving it out is intentional.
dist_columns = ['H2O', 'TotalTHC', 'THC', 'THCA', 'CBG', 'α-Pinene', 'Myrcene',
                'β-Pinene', 'D-Limonene', 'Linalool', 'α-Humelene', 'α-Bisabolol']

fig, axes = plt.subplots(ncols=len(dist_columns), nrows=1, figsize=(15, 6))
for column, ax in zip(dist_columns, axes):
    sns.distplot(cleandf[column], ax=ax)

plt.savefig('1.png')

All the selected features are approximately normally distributed.

In [None]:
# Bin each row's moisture reading (the 'H2O' column) into four labelled ranges:
# 0-4 very low, 4-8 low, 8-11 high, 11-14 very high.
bins = [0, 4, 8, 11, 14]
labels = ['very low','low','high','very high']
# include_lowest=True closes the first interval on the left, so a reading of
# exactly 0 lands in the first bin instead of becoming NaN.
cleandf['moisture content'] = pd.cut(cleandf['H2O'], bins=bins, labels=labels,
                                     include_lowest=True)

In [None]:
# Pie chart of the moisture-content bins; each wedge is annotated with its
# percentage and the absolute number of rows it represents.
moisture_counts = cleandf['moisture content'].value_counts()
total = moisture_counts.sum()
fig, ax = plt.subplots(figsize=(10, 10))
moisture_counts.plot(
    kind='pie',
    ax=ax,
    # p is the wedge's percentage; p * total / 100 converts it back to a count
    autopct=lambda p: '{:.2f}% ({:.0f})'.format(p, p * total / 100),
)
fig.savefig('2.png')

In [None]:
# Bin each row's 'TotalTHC' value into four 100-wide labelled ranges.
bins = [0, 100, 200, 300, 400]
labels = ['0-100mg/g','100-200mg/g','200-300mg/g', '300-400mg/g']
# include_lowest=True closes the first interval on the left, so a value of
# exactly 0 is counted in '0-100mg/g' instead of becoming NaN.
cleandf['Total THC Content'] = pd.cut(cleandf['TotalTHC'], bins=bins, labels=labels,
                                      include_lowest=True)

In [None]:
# Pie chart of the Total THC bins; each wedge is annotated with its
# percentage and the absolute number of rows it represents.
thc_counts = cleandf['Total THC Content'].value_counts()
total = thc_counts.sum()
fig, ax = plt.subplots(figsize=(10, 10))
thc_counts.plot(
    kind='pie',
    ax=ax,
    # p is the wedge's percentage; p * total / 100 converts it back to a count
    autopct=lambda p: '{:.2f}% ({:.0f})'.format(p, p * total / 100),
)
fig.savefig('3.png')

In [None]:
# Bin each row's 'α-Pinene' value into three 2-wide labelled ranges.
bins = [0, 2, 4, 6]
labels = ['0-2mg/g','2-4mg/g','4-6mg/g']
# include_lowest=True closes the first interval on the left, so a value of
# exactly 0 is counted in '0-2mg/g' instead of becoming NaN.
cleandf['α-Pinene Content'] = pd.cut(cleandf['α-Pinene'], bins=bins, labels=labels,
                                     include_lowest=True)

In [None]:
# Pie chart of the α-Pinene bins; each wedge is annotated with its
# percentage and the absolute number of rows it represents.
pinene_counts = cleandf['α-Pinene Content'].value_counts()
total = pinene_counts.sum()
fig, ax = plt.subplots(figsize=(10, 10))
pinene_counts.plot(
    kind='pie',
    ax=ax,
    # p is the wedge's percentage; p * total / 100 converts it back to a count
    autopct=lambda p: '{:.2f}% ({:.0f})'.format(p, p * total / 100),
)
fig.savefig('4.png')

In [None]:
# Distribution of the 'Myrcene' column on its own axes.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot /
# displot replace it) — consider migrating once the seaborn version is confirmed.
sns.distplot(cleandf['Myrcene'])

In [None]:
# Bin each row's 'Myrcene' value into five 5-wide labelled ranges.
bins = [0, 5, 10, 15, 20, 25]
labels = ['0-5mg/g','5-10mg/g','10-15mg/g', '15-20mg/g', '20-25mg/g']
# include_lowest=True closes the first interval on the left, so a value of
# exactly 0 is counted in '0-5mg/g' instead of becoming NaN.
cleandf['Myrcene Content'] = pd.cut(cleandf['Myrcene'], bins=bins, labels=labels,
                                    include_lowest=True)

In [None]:
# Pie chart of the Myrcene bins; each wedge is annotated with its
# percentage and the absolute number of rows it represents.
myrcene_counts = cleandf['Myrcene Content'].value_counts()
total = myrcene_counts.sum()
fig, ax = plt.subplots(figsize=(10, 10))
myrcene_counts.plot(
    kind='pie',
    ax=ax,
    # p is the wedge's percentage; p * total / 100 converts it back to a count
    autopct=lambda p: '{:.2f}% ({:.0f})'.format(p, p * total / 100),
)
fig.savefig('5.png')

In [None]:
# Bin each row's 'D-Limonene' value into four 2-wide labelled ranges.
bins = [0, 2, 4, 6, 8]
labels = ['0-2mg/g','2-4mg/g','4-6mg/g', '6-8mg/g']
# include_lowest=True closes the first interval on the left, so a value of
# exactly 0 is counted in '0-2mg/g' instead of becoming NaN.
cleandf['D-Limonene Content'] = pd.cut(cleandf['D-Limonene'], bins=bins, labels=labels,
                                       include_lowest=True)

In [None]:
# Pie chart of the D-Limonene bins; each wedge is annotated with its
# percentage and the absolute number of rows it represents.
limonene_counts = cleandf['D-Limonene Content'].value_counts()
total = limonene_counts.sum()
fig, ax = plt.subplots(figsize=(10, 10))
limonene_counts.plot(
    kind='pie',
    ax=ax,
    # p is the wedge's percentage; p * total / 100 converts it back to a count
    autopct=lambda p: '{:.2f}% ({:.0f})'.format(p, p * total / 100),
)
fig.savefig('6.png')

In [None]:
# Bin each row's 'Linalool' value into three 2-wide labelled ranges.
bins = [0, 2, 4, 6]
labels = ['0-2mg/g','2-4mg/g','4-6mg/g']
# include_lowest=True closes the first interval on the left, so a value of
# exactly 0 is counted in '0-2mg/g' instead of becoming NaN.
cleandf['Linalool Content'] = pd.cut(cleandf['Linalool'], bins=bins, labels=labels,
                                     include_lowest=True)

In [None]:
# Pie chart of the Linalool bins; each wedge is annotated with its
# percentage and the absolute number of rows it represents.
linalool_counts = cleandf['Linalool Content'].value_counts()
total = linalool_counts.sum()
fig, ax = plt.subplots(figsize=(10, 10))
linalool_counts.plot(
    kind='pie',
    ax=ax,
    # p is the wedge's percentage; p * total / 100 converts it back to a count
    autopct=lambda p: '{:.2f}% ({:.0f})'.format(p, p * total / 100),
)
fig.savefig('7.png')

In [None]:
# Preview cleandf with the binned columns added above; head() keeps the
# output short, matching the other preview cells in this notebook.
cleandf.head()

In [None]:
# Line plot of the four cannabinoid columns across the rows of cleandf
# (the x axis is the DataFrame index).
cannabinoid_cols = ['TotalTHC', 'THC', 'THCA', 'CBG']
plt.figure(figsize=(20, 12))
plt.plot(cleandf[cannabinoid_cols])
plt.xlabel("The sample number")
plt.ylabel("Cannabinoid Concentration (mg/g)")
# Reuse the plotted column list for the legend so there is exactly one label
# per line, in plotting order; a hand-written list with a repeated name would
# shift the labels and mislabel the last line.
plt.legend(cannabinoid_cols);

In [None]:
# Number of rows per strain in cleandf, i.e. after the rows with missing
# values were dropped.
cleandf['Strain'].value_counts()

In [None]:
# Mean cannabinoid and terpene profile of a single strain, as two bar charts.
# The strain is picked by its position in the groupby result, whose index is
# the sorted strain names; the chart titles are read from that index so they
# always name the strain that is actually plotted.
strain_position = 0
cannabinoid_cols = ['TotalTHC', 'THC', 'THCA', 'CBG']
terpene_cols = ['α-Pinene', 'Myrcene', 'β-Pinene', 'D-Limonene',
                'Linalool', 'β-Caryophyllene', 'α-Humelene', 'α-Bisabolol']

# Select the measurement columns before averaging: cleandf also holds the
# categorical bin columns created with pd.cut, and mean() over those raises
# a TypeError on pandas >= 2.0.
strain_means = cleandf.groupby('Strain')[cannabinoid_cols + terpene_cols].mean()
strain_row = strain_means.iloc[[strain_position]]
strain_name = strain_row.index[0]

plt.figure(figsize=(8, 8))
group = strain_row[cannabinoid_cols]
sns.barplot(data=group).set_title("Cannabinoids in {}".format(strain_name))

plt.figure(figsize=(8, 8))
group = strain_row[terpene_cols]
sns.barplot(data=group).set_title("Terpenes in {}".format(strain_name))
plt.xticks(rotation=45);

In [None]:
# Mean cannabinoid and terpene profile of a single strain, as two bar charts.
# The strain is picked by its position in the groupby result, whose index is
# the sorted strain names; the chart titles are read from that index so they
# always name the strain that is actually plotted.
strain_position = 1
cannabinoid_cols = ['TotalTHC', 'THC', 'THCA', 'CBG']
terpene_cols = ['α-Pinene', 'Myrcene', 'β-Pinene', 'D-Limonene',
                'Linalool', 'β-Caryophyllene', 'α-Humelene', 'α-Bisabolol']

# Select the measurement columns before averaging: cleandf also holds the
# categorical bin columns created with pd.cut, and mean() over those raises
# a TypeError on pandas >= 2.0.
strain_means = cleandf.groupby('Strain')[cannabinoid_cols + terpene_cols].mean()
strain_row = strain_means.iloc[[strain_position]]
strain_name = strain_row.index[0]

plt.figure(figsize=(8, 8))
group = strain_row[cannabinoid_cols]
sns.barplot(data=group).set_title("Cannabinoids in {}".format(strain_name))

plt.figure(figsize=(8, 8))
group = strain_row[terpene_cols]
sns.barplot(data=group).set_title("Terpenes in {}".format(strain_name))
plt.xticks(rotation=45);

In [None]:
# Mean cannabinoid and terpene profile of a single strain, as two bar charts.
# The strain is picked by its position in the groupby result, whose index is
# the sorted strain names; the chart titles are read from that index so they
# always name the strain that is actually plotted.
strain_position = 2
cannabinoid_cols = ['TotalTHC', 'THC', 'THCA', 'CBG']
terpene_cols = ['α-Pinene', 'Myrcene', 'β-Pinene', 'D-Limonene',
                'Linalool', 'β-Caryophyllene', 'α-Humelene', 'α-Bisabolol']

# Select the measurement columns before averaging: cleandf also holds the
# categorical bin columns created with pd.cut, and mean() over those raises
# a TypeError on pandas >= 2.0.
strain_means = cleandf.groupby('Strain')[cannabinoid_cols + terpene_cols].mean()
strain_row = strain_means.iloc[[strain_position]]
strain_name = strain_row.index[0]

plt.figure(figsize=(8, 8))
group = strain_row[cannabinoid_cols]
sns.barplot(data=group).set_title("Cannabinoids in {}".format(strain_name))

plt.figure(figsize=(8, 8))
group = strain_row[terpene_cols]
sns.barplot(data=group).set_title("Terpenes in {}".format(strain_name))
plt.xticks(rotation=45);

In [None]:
# Subset of cleandf holding the strain label and the raw measurements only;
# the binned columns added to cleandf earlier are left out.
ml_columns = [
    'Strain', 'H2O',
    'TotalTHC', 'THC', 'THCA', 'CBG',
    'α-Pinene', 'Myrcene', 'β-Pinene', 'D-Limonene', 'Linalool',
    'β-Caryophyllene', 'α-Humelene', 'α-Bisabolol',
]
mldf = cleandf[ml_columns]

In [None]:
# Preview the first rows of mldf.
mldf.head()