<a href="https://colab.research.google.com/github/jharris121/AcademyPublic/blob/main/Session3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Session 3 - Data Visulisation

!pip install -q kaggle
from google.colab import files
files.upload()
#create a kaggle folder
!mkdir ~/.kaggle
# Go on kaggle > Account > Create New API token
# Save the json file in your laptop in a dedicated folder
# copy the kaggle.json to folder created
!cp kaggle.json ~/.kaggle
#permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json
# Datasets available here: 
# https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009
!kaggle datasets download -d uciml/red-wine-quality-cortez-et-al-2009
!unzip red-wine-quality-cortez-et-al-2009.zip

import pandas as pd # Here I import a library named pandas which is specialized in
# data manipulation. I give an alias pd it is a kind of pseudonyme where pd refers to pandas.
import numpy as np
df = pd.read_csv('/content/winequality-red.csv')


Saving kaggle.json to kaggle.json
Downloading red-wine-quality-cortez-et-al-2009.zip to /content
  0% 0.00/25.6k [00:00<?, ?B/s]
100% 25.6k/25.6k [00:00<00:00, 11.2MB/s]
Archive:  red-wine-quality-cortez-et-al-2009.zip
  inflating: winequality-red.csv     


In [3]:
# Now let's make the dataset more complex so that there is additional work to do later!
# Not important to understand - educational purpose only.

# Binary feature used as the target.
# numpy.where is the vectorised equivalent of an if-then-else statement:
# if df['quality'] >= 7 then df['Y'] = 1, else df['Y'] = 0
df['Y'] = np.where(df['quality'] >= 7, 1, 0)

# Binary versions of 'fixed acidity' and 'volatile acidity'
df['fixed_acidity_cat'] = np.where(df['fixed acidity'] <= 8, 0, 1)
df['volatile_acidity_cat'] = np.where(df['volatile acidity'] >= 0.4, 0, 1)

# Create duplicated rows: draw a reproducible random sample of df and stack it under df.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# ignore_index=True rebuilds a clean 0..n-1 index (replaces the former reset_index call).
temp = df.sample(n=150, random_state=893717398)
df = pd.concat([df, temp], ignore_index=True)
del temp

# New feature pH2: a copy of pH where every 15th row (index 0, 15, 30, ...) is set to missing
df['pH2'] = np.where(df['pH'].index % 15 == 0, np.nan, df['pH'])

# New feature 'acidity_other': random values centred on the median of 'fixed acidity',
# then missing values are injected on every odd row.
rng = np.random.default_rng(893717398)
df = df.join(pd.Series(rng.normal(df['fixed acidity'].median(), 1, len(df)), name='acidity_other'))
# NOTE(review): the line below fills the even rows with df['pH'], not with the random
# values generated just above, so the random draw is discarded. This looks unintended
# (df['acidity_other'] was probably meant) - kept as is because the rest of the notebook
# and its recorded outputs rely on the current values. TODO confirm.
df['acidity_other'] = np.where(df['acidity_other'].index % 2 != 0, np.nan, df['pH'])

#Here it is a function that will automatically create a feature of random dates between a defined interval
def random_datetimes_or_dates(start, end, out_format='datetime', n=10):
    """Draw ``n`` random timestamps between ``start`` and ``end``.

    ``start.value`` and ``end.value`` are read as Unix times in nanoseconds and
    are floor-divided down to a coarser step before the draw:

    * ``out_format == 'datetime'`` -> one-second steps (divide by 10**9, unit 's')
    * any other ``out_format``     -> one-day steps (divide by 24*60*60*10**9, unit 'D')

    The integers are drawn with ``np.random.randint`` (upper bound excluded), so
    the result depends on NumPy's global random state; call ``np.random.seed``
    beforehand for reproducible values.

    Parameters
    ----------
    start, end : objects exposing ``.value`` (nanoseconds since the epoch),
        e.g. the result of ``pd.to_datetime('2021-01-01')``.
    out_format : str, default 'datetime'
        Selects the resolution of the generated values (see above).
    n : int, default 10
        Number of values to generate.

    Returns
    -------
    The result of ``pd.to_datetime`` applied to the ``n`` drawn integers.
    """
    if out_format == 'datetime':
        ns_per_step, step_unit = 10**9, 's'
    else:
        ns_per_step, step_unit = 24*60*60*10**9, 'D'

    # Express both bounds as a whole number of steps since the epoch
    low = start.value // ns_per_step
    high = end.value // ns_per_step

    draws = np.random.randint(low, high, n)
    return pd.to_datetime(draws, unit=step_unit)

# Build the new feature 'date': one random timestamp per row of df,
# drawn between 2021-01-01 and 2021-06-30 with one-second resolution.
np.random.seed(893717398)  # fixes NumPy's global seed so the drawn dates are reproducible
d_start, d_end = pd.to_datetime('2021-01-01'), pd.to_datetime('2021-06-30')
df['date'] = random_datetimes_or_dates(d_start, d_end, n=len(df), out_format='datetime')
# Store the dates as text; they are parsed back with pd.to_datetime later in the notebook
df['date'] = df['date'].astype(str)

## End of the complexification



In [5]:
# Preview the first five rows, including the engineered columns (Y, *_cat, pH2, acidity_other, date)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Y,fixed_acidity_cat,volatile_acidity_cat,pH2,acidity_other,date
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,0,0,,3.51,2021-01-06 14:38:36
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,0,0,3.2,,2021-02-07 08:11:30
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,0,0,3.26,3.26,2021-06-27 12:25:26
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,1,1,3.16,,2021-06-16 05:25:38
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,0,0,3.51,3.51,2021-01-20 10:10:16


In [6]:
# (number of rows, number of columns) of the frame
df.shape

(1749, 18)

In [7]:
# Type of each column; 'date' is still text (object) at this point
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
Y                         int64
fixed_acidity_cat         int64
volatile_acidity_cat      int64
pH2                     float64
acidity_other           float64
date                     object
dtype: object

In [11]:
def citric_acid_cat(x):
  """Return the category letter of a single citric acid value.

  'A' when x < 0.21, 'B' when 0.21 <= x < 0.3, and 'C' for everything else
  (any value for which both comparisons are false).
  """
  # Guard clauses, tested from the lowest threshold upwards
  if x < 0.21:
    return 'A'
  if x < 0.3:
    return 'B'
  return 'C'

# Create the new feature citric_cat by turning each numerical value of
# 'citric acid' into the category letter defined above.
# Series.apply calls citric_acid_cat once per element, so no explicit for loop
# is needed; each element of df['citric acid'] is passed as the function's x.
df['citric_cat'] = df['citric acid'].apply(citric_acid_cat)


In [12]:
# Summary of the text column: non-null count, number of distinct labels, most frequent label and its frequency
df['citric_cat'].describe()

count     1749
unique       3
top          C
freq       773
Name: citric_cat, dtype: object

In [13]:
# Relabel category 'C' as 'Z'; every other label is left unchanged
df.loc[df['citric_cat'] == 'C', 'citric_cat'] = 'Z'

In [14]:
# Number of rows per category label, most frequent first
df['citric_cat'].value_counts()

Z    773
A    693
B    283
Name: citric_cat, dtype: int64

In [16]:
# Share of each category label, as a percentage rounded to two decimals
df['citric_cat'].value_counts(normalize=True).mul(100).round(2)

Z    44.20
A    39.62
B    16.18
Name: citric_cat, dtype: float64

In [19]:
# Number of missing values in each row (axis=1 sums across the columns)
df.isna().sum(axis=1)

0       1
1       1
2       0
3       1
4       0
       ..
1744    0
1745    1
1746    0
1747    1
1748    0
Length: 1749, dtype: int64

In [21]:
# NOTE: this preview is evaluated but not displayed, because only the last
# expression of a cell is rendered and this cell ends with an assignment.
df['date'].head(3)
# Parse the text column into datetime values, giving the exact layout of the stored strings
df['date']=pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

In [22]:
# Check the column types again: 'date' is now datetime64[ns]
df.dtypes

fixed acidity                  float64
volatile acidity               float64
citric acid                    float64
residual sugar                 float64
chlorides                      float64
free sulfur dioxide            float64
total sulfur dioxide           float64
density                        float64
pH                             float64
sulphates                      float64
alcohol                        float64
quality                          int64
Y                                int64
fixed_acidity_cat                int64
volatile_acidity_cat             int64
pH2                            float64
acidity_other                  float64
date                    datetime64[ns]
citric_cat                      object
dtype: object

In [23]:
# Fixed reference date used as "today", so the computed differences do not depend on the run date
today = pd.to_datetime('2021-10-12', format = '%Y-%m-%d')
# NOTE: this bare expression is not displayed (only the last expression of a cell is rendered)
today
# Elapsed time between the reference date and each row's date (a timedelta column)
df['time_diff'] = today -df['date']
# Show the first three differences
df['time_diff'].head(3)

0   278 days 09:21:24
1   246 days 15:48:30
2   106 days 11:34:34
Name: time_diff, dtype: timedelta64[ns]

In [24]:
# Check the column types again: 'time_diff' is stored as timedelta64[ns]
df.dtypes

fixed acidity                   float64
volatile acidity                float64
citric acid                     float64
residual sugar                  float64
chlorides                       float64
free sulfur dioxide             float64
total sulfur dioxide            float64
density                         float64
pH                              float64
sulphates                       float64
alcohol                         float64
quality                           int64
Y                                 int64
fixed_acidity_cat                 int64
volatile_acidity_cat              int64
pH2                             float64
acidity_other                   float64
date                     datetime64[ns]
citric_cat                       object
time_diff               timedelta64[ns]
dtype: object

In [26]:
# Express the elapsed time as a (fractional) number of days: dividing a timedelta
# column by a one-day duration yields plain floats.
df['day_diff'] = df['time_diff'].div(np.timedelta64(1, 'D')).astype('float64')
df['day_diff']

0       278.389861
1       246.658681
2       106.482338
3       117.773866
4       264.576204
           ...    
1744    229.515891
1745    122.306586
1746    130.300637
1747    223.381343
1748    205.710289
Name: day_diff, Length: 1749, dtype: float64

In [None]:
##### NOTE: everything above this point works; I lost track from here on, so what follows is unfinished

#np.random.seed(8937173981)
#randomf1 = pd.series




In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D