# References:
  * Titanic Dataset: [https://www.kaggle.com/competitions/titanic/overview](https://www.kaggle.com/competitions/titanic/overview)


# 0- Import libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# 1- Detect and process null values

## 1.1- Question 1: count number of null values for each column
* Information:
  * ```df```: a dataframe

* Requirements:
  1. Count number of null and not-null values for each column and show the results as in the output cell

In [8]:
#create DataFrame with some NaN values
df = pd.DataFrame({'rating': [np.nan, 85, np.nan, 88, 94, 90, 76, 75, 87, 86],
                   'points': [25, np.nan, 14, 16, 27, 20, 12, 15, 14, 19],
                   'assists': [5, 7, 7, np.nan, 5, 7, 6, 9, 9, 5],
                   'rebounds': [11, 8, 10, 6, 6, 9, 6, 10, 10, 7]})

print("DataFrame (df):")
print("-"*50)
print(df)

print("="*100)
print("Counts of null values for each columns:")
print(df.isnull().sum())

print("Counts of not-null values for each columns:")
print(df.notna().sum())

DataFrame (df):
--------------------------------------------------
   rating  points  assists  rebounds
0     NaN    25.0      5.0        11
1    85.0     NaN      7.0         8
2     NaN    14.0      7.0        10
3    88.0    16.0      NaN         6
4    94.0    27.0      5.0         6
5    90.0    20.0      7.0         9
6    76.0    12.0      6.0         6
7    75.0    15.0      9.0        10
8    87.0    14.0      9.0        10
9    86.0    19.0      5.0         7
Counts of null values for each columns:
rating      2
points      1
assists     1
rebounds    0
dtype: int64
Counts of not-null values for each columns:
rating       8
points       9
assists      9
rebounds    10
dtype: int64


## 1.2- Question 2: **dropna**
* Information:
  * ```df```: a dataframe

* Requirements:
  1. Drop columns or rows that contain null values
  2. Show the results as in the output cell

In [11]:
#create DataFrame with some NaN values
df = pd.DataFrame({'rating': [np.nan, 85, np.nan, 88, 94, 90, 76, 75, 87, 86],
                   'points': [25, np.nan, 14, 16, 27, 20, 12, 15, 14, 19],
                   'assists': [5, 7, 7, np.nan, 5, 7, 6, 9, 9, 5],
                   'rebounds': [11, 8, 10, 6, 6, 9, 6, 10, 10, 7]})

print("DataFrame (df):")
print("-"*50)
print(df)

print("="*100)

print("IF dropping columns that contains null values, df.dropna(axis=1)")
print(df.dropna(axis=1))

print("-"*50)
print("IF dropping rows that contains null values, df.dropna(axis=0)")
print(df.dropna(axis=0))


DataFrame (df):
--------------------------------------------------
   rating  points  assists  rebounds
0     NaN    25.0      5.0        11
1    85.0     NaN      7.0         8
2     NaN    14.0      7.0        10
3    88.0    16.0      NaN         6
4    94.0    27.0      5.0         6
5    90.0    20.0      7.0         9
6    76.0    12.0      6.0         6
7    75.0    15.0      9.0        10
8    87.0    14.0      9.0        10
9    86.0    19.0      5.0         7
IF dropping columns that contains null values, df.dropna(axis=1)
   rebounds
0        11
1         8
2        10
3         6
4         6
5         9
6         6
7        10
8        10
9         7


## 1.3- Question 3: **fillna**
* Requirements:
  1. Replace all the null values with the same value, for example, 0 or here -999
  2. Replace null values with their previous value along columns (axis=0). Try all the following versions:
    * ```fillna(method='ffill')```
    * ```ffill()```
  3. Replace null values with their next value along columns (axis=0). Try all the following versions:
    * ```fillna(method='bfill')```
    * ```bfill()```
    * ```backfill()```
  4. Show the results as in the output cell
  5. Do tasks 2 and 3 along rows (axis=1)


In [13]:
#create DataFrame with some NaN values
df = pd.DataFrame({'rating': [np.nan, 85, np.nan, 88, 94, 90, 76, 75, 87, 86],
                   'points': [25, np.nan, 14, 16, 27, 20, 12, 15, 14, 19],
                   'assists': [5, 7, 7, np.nan, 5, 7, 6, 9, 9, 5],
                   'rebounds': [11, 8, 10, 6, 6, 9, 6, 10, 10, 7]})

print("DataFrame (df):")
print("-"*50)
print(df)

print("="*100)
print("IF replacing null values with a value, e.g. -999, df.fillna(-999):")
print(df.fillna(-999))

print("-"*50)
print("IF replacing null values with previous value in columns, df.fillna(method='ffill'):")
print(df.fillna(method='ffill'))

print("-"*50)
print("IF replacing null values with previous value in columns, df.fillna(method='bfill'):")
print(df.fillna(method='bfill'))



DataFrame (df):
--------------------------------------------------
   rating  points  assists  rebounds
0     NaN    25.0      5.0        11
1    85.0     NaN      7.0         8
2     NaN    14.0      7.0        10
3    88.0    16.0      NaN         6
4    94.0    27.0      5.0         6
5    90.0    20.0      7.0         9
6    76.0    12.0      6.0         6
7    75.0    15.0      9.0        10
8    87.0    14.0      9.0        10
9    86.0    19.0      5.0         7
IF replacing null values with a value, e.g. -999, df.fillna(-999):
   rating  points  assists  rebounds
0  -999.0    25.0      5.0        11
1    85.0  -999.0      7.0         8
2  -999.0    14.0      7.0        10
3    88.0    16.0   -999.0         6
4    94.0    27.0      5.0         6
5    90.0    20.0      7.0         9
6    76.0    12.0      6.0         6
7    75.0    15.0      9.0        10
8    87.0    14.0      9.0        10
9    86.0    19.0      5.0         7
--------------------------------------------------
I

  print(df.fillna(method='ffill'))
  print(df.fillna(method='bfill'))


## 1.4- Question 4: **fillna**
* Requirements:
  1. Replace each null value with the mean of its column
  2. Show the result as in the output cell

In [37]:
#create DataFrame with some NaN values
df = pd.DataFrame({'rating': [np.nan, 85, np.nan, 88, 94, 90, 76, 75, 87, 86],
                   'points': [25, np.nan, 14, 16, 27, 20, 12, 15, 14, 19],
                   'assists': [5, 7, 7, np.nan, 5, 7, 6, 9, 9, 5],
                   'rebounds': [11, 8, 10, 6, 6, 9, 6, 10, 10, 7]})

print("DataFrame (df):")
print("-"*50)
print(df)
# YOUR CODE IS HERE
mean_value = np.mean(df, axis=0)
print(mean_value)
df.fillna(mean_value, inplace=True)

print("="*100)
print("IF replacing null values with the mean of corresponding column, df.fillna(df.mean()):")
print(df)


DataFrame (df):
--------------------------------------------------
   rating  points  assists  rebounds
0     NaN    25.0      5.0        11
1    85.0     NaN      7.0         8
2     NaN    14.0      7.0        10
3    88.0    16.0      NaN         6
4    94.0    27.0      5.0         6
5    90.0    20.0      7.0         9
6    76.0    12.0      6.0         6
7    75.0    15.0      9.0        10
8    87.0    14.0      9.0        10
9    86.0    19.0      5.0         7
rating      85.125000
points      18.000000
assists      6.666667
rebounds     8.300000
dtype: float64
IF replacing null values with the mean of corresponding column, df.fillna(df.mean()):
   rating  points   assists  rebounds
0  85.125    25.0  5.000000        11
1  85.000    18.0  7.000000         8
2  85.125    14.0  7.000000        10
3  88.000    16.0  6.666667         6
4  94.000    27.0  5.000000         6
5  90.000    20.0  7.000000         9
6  76.000    12.0  6.000000         6
7  75.000    15.0  9.000000      

## 1.5- Question 5: **fillna**
* Requirements:
  1. Replace each null value with the mean of its row
  2. Show the result as in the output cell

In [39]:
#create DataFrame with some NaN values
df = pd.DataFrame({'rating': [np.nan, 85, np.nan, 88, 94, 90, 76, 75, 87, 86],
                   'points': [25, np.nan, 14, 16, 27, 20, 12, 15, 14, 19],
                   'assists': [5, 7, 7, np.nan, 5, 7, 6, 9, 9, 5],
                   'rebounds': [11, 8, 10, 6, 6, 9, 6, 10, 10, 7]})

print("DataFrame (df):")
print("-"*50)
print(df)

print("="*100)
print("IF replacing null values with the mean of corresponding row, (df.T) = transpose:")
transpose_df = df.T
transpose_df.fillna(transpose_df.mean(axis=0), inplace=True)
print(transpose_df.T)




DataFrame (df):
--------------------------------------------------
   rating  points  assists  rebounds
0     NaN    25.0      5.0        11
1    85.0     NaN      7.0         8
2     NaN    14.0      7.0        10
3    88.0    16.0      NaN         6
4    94.0    27.0      5.0         6
5    90.0    20.0      7.0         9
6    76.0    12.0      6.0         6
7    75.0    15.0      9.0        10
8    87.0    14.0      9.0        10
9    86.0    19.0      5.0         7
IF replacing null values with the mean of corresponding row, (df.T) = transpose:
      rating     points    assists  rebounds
0  13.666667  25.000000   5.000000      11.0
1  85.000000  33.333333   7.000000       8.0
2  10.333333  14.000000   7.000000      10.0
3  88.000000  16.000000  36.666667       6.0
4  94.000000  27.000000   5.000000       6.0
5  90.000000  20.000000   7.000000       9.0
6  76.000000  12.000000   6.000000       6.0
7  75.000000  15.000000   9.000000      10.0
8  87.000000  14.000000   9.000000      

`fillna` function usually replace the value within the column label, this is also the reason which it only works with mean of columns not mean of rows. So when we calculate the mean of columns, it returns something like 
```
rating      85.125000
points      18.000000
assists      6.666667
rebounds     8.300000
```

Which then can easily map to the DataFrame like column 'rating', we have 85. and so on so forth 

But with mean of rows, it returns something like 
```
0    13.666667
1    33.333333
2    10.333333
3    36.666667
4    33.000000
5    31.500000
6    25.000000
7    27.250000
8    30.000000
9    29.250000
```

which can not map with column label


## 1.6- Question 6: **fillna**
* Requirements:
  1. Replace each null value with the mean of its column, using only values for its group (target)
  
  2. Show the result as in the output cell

In [59]:
#create DataFrame with some NaN values
df = pd.DataFrame({
    "x1":     [None, 3,   3,    np.nan, 4,     2,   None,   3],
    "x2":     [3,    1,   1,    2,      None,  2,   None,   3],
    "x3":     [4,    2,   2,    4,      3,     3,   2,      2],
    "target": [0,    0,   0,    0,      1,     1,   1,      1]
}) 
print("DataFrame (df):")
print("-"*50)
print(df)

df_filled = df.copy()
cols = ["x1", "x2", "x3"]
df_filled.groupby("target")[cols]
df_filled[cols] = df_filled.groupby("target")[cols].transform(lambda s: s.fillna(s.mean()))

print("="*100)
print("IF replacing null value with its mean, computing from its group (target)")
print(df_filled)

DataFrame (df):
--------------------------------------------------
    x1   x2  x3  target
0  NaN  3.0   4       0
1  3.0  1.0   2       0
2  3.0  1.0   2       0
3  NaN  2.0   4       0
4  4.0  NaN   3       1
5  2.0  2.0   3       1
6  NaN  NaN   2       1
7  3.0  3.0   2       1
IF replacing null value with its mean, computing from its group (target)
    x1   x2  x3  target
0  3.0  3.0   4       0
1  3.0  1.0   2       0
2  3.0  1.0   2       0
3  3.0  2.0   4       0
4  4.0  2.5   3       1
5  2.0  2.0   3       1
6  3.0  2.5   2       1
7  3.0  3.0   2       1


# 2- Add and remove columns

## 2.1- Question 1
* Information:
  * ```df```: a dataframe

* Requirements:
  1. Add a column ```f1_score``` - computed from two other columns, as follows:
$$
\text{f\_score} = 2 \times \frac{\text{precision} \times \text{recall}}{\text{precision} + \text{recall}}
$$

  2. Drop column ```counts```
  3. Show the results as in the output cell

In [67]:
#create DataFrame with some NaN values
df = pd.DataFrame({
      "precision": [0.8, 0.9, 0.5, 0.6, 0.95],
      "recall":    [0.9, 0.9, 0.8, 0.7, 0.8],
      "counts":    [100, 500, 200, 250, 400],
  },
  index = ["Dog", "Cat", "Chicken", "Horse", "Sheep"]) 

print("DataFrame (df):")
print("-"*50)
print(df)
print()
print("="*100)

upper_value = df["precision"] * df["recall"] 
lower_value = df["precision"] + df["recall"]
f1_score = 2 * (upper_value / lower_value)
df["f1_score"] = f1_score

print("After adding column 'f1_score':")
print(df)

print("="*100)
print("After dropping columns 'counts':")
print(df.drop("counts", axis=1))



DataFrame (df):
--------------------------------------------------
         precision  recall  counts
Dog           0.80     0.9     100
Cat           0.90     0.9     500
Chicken       0.50     0.8     200
Horse         0.60     0.7     250
Sheep         0.95     0.8     400

After adding column 'f1_score':
         precision  recall  counts  f1_score
Dog           0.80     0.9     100  0.847059
Cat           0.90     0.9     500  0.900000
Chicken       0.50     0.8     200  0.615385
Horse         0.60     0.7     250  0.646154
Sheep         0.95     0.8     400  0.868571
After dropping columns 'counts':
         precision  recall  f1_score
Dog           0.80     0.9  0.847059
Cat           0.90     0.9  0.900000
Chicken       0.50     0.8  0.615385
Horse         0.60     0.7  0.646154
Sheep         0.95     0.8  0.868571


## 2.2- Question 2
* Information:
  * ```df```: a dataframe

* Requirements:
  1. Compute **precision**, **recall** and **f1_score** for all classes, i.e., *'Dog', 'Cat', 'Chicken', 'Horse', and 'Sheep'* by computing **unweighted mean** from colums ```precision```, ```recall```, and ```f1_score```.

  2. Compute **precision**, **recall** and **f1_score** for all classes, i.e., *'Dog', 'Cat', 'Chicken', 'Horse', and 'Sheep'* by computing **weighted mean** from colums ```precision```, ```recall```, and ```f1_score```; where, 
  $\text{weights} = \text{counts/counts.sum()}$
  3. Show the results as in the output cell


* Guidelines:
  * use ```df.multiply(weights, axis=0)```

In [77]:
#create DataFrame with some NaN values
df = pd.DataFrame({
      "precision": [0.8, 0.9, 0.5, 0.6, 0.95],
      "recall":    [0.9, 0.9, 0.8, 0.7, 0.8],
      "counts":    [100, 500, 200, 250, 400],
  },
  index = ["Dog", "Cat", "Chicken", "Horse", "Sheep"]) 

print("DataFrame (df):")
print("-"*50)
print(df)
print()
print("="*100)
# YOUR CODE IS HERE

upper_value = df["precision"] * df["recall"] 
lower_value = df["precision"] + df["recall"]
f1_score = 2 * (upper_value / lower_value)
df["f1_score"] = f1_score

weight_value = df["counts"] / df["counts"].sum()

df.drop("counts", axis=1, inplace=True)
print("Unweighted mean:")
print(df.mean())

weight_df = df.multiply(weight_value, axis=0)
print("="*100)
print("Weighted mean:")
print(weight_df.sum() / weight_value.sum())

DataFrame (df):
--------------------------------------------------
         precision  recall  counts
Dog           0.80     0.9     100
Cat           0.90     0.9     500
Chicken       0.50     0.8     200
Horse         0.60     0.7     250
Sheep         0.95     0.8     400

Unweighted mean:
precision    0.750000
recall       0.820000
f1_score     0.775434
dtype: float64
Weighted mean:
precision    0.800000
recall       0.824138
f1_score     0.804655
dtype: float64
