# Handling Missing Data
***

In [2]:
import pandas as pd

### The `parse_dates` parameter converts the string dates in the listed columns to datetime values.

In [3]:
# Load the weather data: sep=None lets the Python engine sniff the delimiter,
# the "Date" column is parsed as datetimes and then becomes the index.
# NOTE(review): the first header is displayed as "\ufeffIndex" in the output,
# which looks like a UTF-8 BOM at the start of the file; confirm whether
# encoding="utf-8-sig" should be passed here.
dataframe = pd.read_csv(
    "Excel and CSV Files/Ankara Weather Data.csv",
    sep=None,
    engine="python",
    parse_dates=["Date"],
).set_index("Date")
dataframe

Unnamed: 0_level_0,﻿Index,Windspeed,Event,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,0.0,8.0,Cloudy,15.0
2021-02-01,1.0,8.0,Cloudy,15.0
2021-03-01,2.0,7.0,Cloudy,
2021-04-01,3.0,7.0,,16.0
2021-05-01,4.0,6.0,Cloudy,18.0
2021-06-01,5.0,6.0,,19.0
2021-07-01,6.0,,Sunny,20.0
2021-08-01,,,,
2021-09-01,8.0,5.0,Sunny,
2021-10-01,9.0,4.0,Sunny,


### The `.fillna` method replaces NaN values with the given value.

In [4]:
# Replace every missing cell with 0, whatever column it is in.
# fillna returns a new frame, so `dataframe` itself is not modified.
new_df = dataframe.fillna(value=0)
new_df

Unnamed: 0_level_0,﻿Index,Windspeed,Event,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,0.0,8.0,Cloudy,15.0
2021-02-01,1.0,8.0,Cloudy,15.0
2021-03-01,2.0,7.0,Cloudy,0.0
2021-04-01,3.0,7.0,0,16.0
2021-05-01,4.0,6.0,Cloudy,18.0
2021-06-01,5.0,6.0,0,19.0
2021-07-01,6.0,0.0,Sunny,20.0
2021-08-01,0.0,0.0,0,0.0
2021-09-01,8.0,5.0,Sunny,0.0
2021-10-01,9.0,4.0,Sunny,0.0


### With a dictionary, a different fill value can be given for each column.

In [5]:
# Per-column fill values; columns that are not listed keep their NaN cells.
new_df = dataframe.fillna(
    value={"Temperature": 0, "Windspeed": 0, "Event": "No Event"}
)
new_df

Unnamed: 0_level_0,﻿Index,Windspeed,Event,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,0.0,8.0,Cloudy,15.0
2021-02-01,1.0,8.0,Cloudy,15.0
2021-03-01,2.0,7.0,Cloudy,0.0
2021-04-01,3.0,7.0,No Event,16.0
2021-05-01,4.0,6.0,Cloudy,18.0
2021-06-01,5.0,6.0,No Event,19.0
2021-07-01,6.0,0.0,Sunny,20.0
2021-08-01,,0.0,No Event,0.0
2021-09-01,8.0,5.0,Sunny,0.0
2021-10-01,9.0,4.0,Sunny,0.0


### `bfill` (backward fill) fills each missing cell with the next valid value below it.
### `ffill` (forward fill) does the same with the last valid value above it.

In [6]:
# Backward fill: each missing cell takes the next valid value further down
# its column. DataFrame.bfill() is the supported spelling; the equivalent
# fillna(method="bfill") form is deprecated in recent pandas releases.
new_df = dataframe.bfill()
new_df

Unnamed: 0_level_0,﻿Index,Windspeed,Event,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,0.0,8.0,Cloudy,15.0
2021-02-01,1.0,8.0,Cloudy,15.0
2021-03-01,2.0,7.0,Cloudy,16.0
2021-04-01,3.0,7.0,Cloudy,16.0
2021-05-01,4.0,6.0,Cloudy,18.0
2021-06-01,5.0,6.0,Sunny,19.0
2021-07-01,6.0,5.0,Sunny,20.0
2021-08-01,8.0,5.0,Sunny,30.0
2021-09-01,8.0,5.0,Sunny,30.0
2021-10-01,9.0,4.0,Sunny,30.0


### The `interpolate` method (the default method is `"linear"`) fills NaN values based on the cells above and below them.

In [7]:
# Time-based interpolation weights the neighbouring values by their distance
# on the datetime index, so it is only meaningful for numeric columns.
# Restrict it to those explicitly: text columns such as "Event" are copied
# through unchanged, and interpolate() on object-dtype columns is deprecated
# in recent pandas releases.
numeric_columns = dataframe.select_dtypes(include="number").columns
new_df = dataframe.copy()
new_df[numeric_columns] = dataframe[numeric_columns].interpolate(method="time")
new_df

Unnamed: 0_level_0,﻿Index,Windspeed,Event,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,0.0,8.0,Cloudy,15.0
2021-02-01,1.0,8.0,Cloudy,15.0
2021-03-01,2.0,7.0,Cloudy,15.474576
2021-04-01,3.0,7.0,,16.0
2021-05-01,4.0,6.0,Cloudy,18.0
2021-06-01,5.0,6.0,,19.0
2021-07-01,6.0,5.673913,Sunny,20.0
2021-08-01,7.0,5.336957,,22.520325
2021-09-01,8.0,5.0,Sunny,25.04065
2021-10-01,9.0,4.0,Sunny,27.479675


### The `dropna` method removes rows according to the given criteria (here `thresh = 1` keeps only the rows that have at least one non-NaN value).

In [8]:
# thresh=1 keeps a row only when it holds at least one non-missing value,
# so only rows that are empty in every column are removed.
new_df = dataframe.dropna(axis=0, thresh=1)
new_df

Unnamed: 0_level_0,﻿Index,Windspeed,Event,Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,0.0,8.0,Cloudy,15.0
2021-02-01,1.0,8.0,Cloudy,15.0
2021-03-01,2.0,7.0,Cloudy,
2021-04-01,3.0,7.0,,16.0
2021-05-01,4.0,6.0,Cloudy,18.0
2021-06-01,5.0,6.0,,19.0
2021-07-01,6.0,,Sunny,20.0
2021-09-01,8.0,5.0,Sunny,
2021-10-01,9.0,4.0,Sunny,
2021-11-01,10.0,5.0,Sunny,30.0


### `date_range` builds a new daily date index and `reindex` conforms the dataframe to it; dates that were not in the original index become NaN rows.

In [9]:
# Daily index from 1 January to 21 January 2021. ISO-8601 strings are used
# because dotted strings are ambiguous: "01.01.2021" is read month-first,
# while "21.01.2021" can only be read day-first.
dateRange = pd.date_range(start="2021-01-01", end="2021-01-21", freq="D")
# date_range already returns a DatetimeIndex, so no extra wrapping is needed.
idx = dateRange
# Rows for dates missing from the original index are filled with NaN.
df = dataframe.reindex(idx)
df

Unnamed: 0,﻿Index,Windspeed,Event,Temperature
2021-01-01,0.0,8.0,Cloudy,15.0
2021-01-02,,,,
2021-01-03,,,,
2021-01-04,,,,
2021-01-05,,,,
2021-01-06,,,,
2021-01-07,,,,
2021-01-08,,,,
2021-01-09,,,,
2021-01-10,,,,


***
# Part 2 

In [12]:
import numpy as np

When creating, downloading or scraping data for dataframes, missing data is sometimes encoded in a special way.
<br>For example, -9999 might be the placeholder for missing data, or some similar sentinel number might be used.

In these situations we can use the `.replace()` method.
<br>It is used as `dataframe.replace(replaced, replacer)`.
<br>We can use `np.nan` as the replacer.

But what if we had two or more different placeholder values?
<br>For example, -888 as well.
<br>We can pass the "replaced" parameter as a list: `[-9999, -888]`

If the value that we are replacing is a legitimate value in some columns, we might want to leave it untouched there.
<br>We can use a dictionary that maps each column to the value to replace in it:

In [16]:
# Column-specific placeholders: -9999 is treated as missing only in
# "Temperature" and "Windspeed", and 0 only in "Event".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
newDF = dataframe.replace(
    {
        'Temperature': -9999,
        'Windspeed': -9999,
        'Event': 0,
    },
    np.nan,
)

We can also pass a mapping from each old value to its replacement.

In [None]:
# Mapping form: each key is a value to look for and each dict value is its
# replacement. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
newDF = dataframe.replace({
    -9999: np.nan,
    'No Event': 'Sunny',
})

### What if the values in the dataframe had units of measure?
For example, temperatures carry an F for Fahrenheit and wind speeds carry mph.
<br>What we should do is strip the units away.
The best way to do this is with regex (regular expressions).
Regular expressions are used to recognize patterns:

In [17]:
# With regex=True the first argument is a pattern: every ASCII letter in
# every cell is replaced by the empty string.
newDf = dataframe.replace(to_replace='[A-Za-z]', value='', regex=True)

But if we do that, every value in the Event column will be erased as well.
To prevent that, we have to use a dictionary that limits the pattern to specific columns:

In [19]:
# Apply the letter-stripping pattern only to the measurement columns, so
# the text in other columns is left as it is.
newDf = dataframe.replace(
    to_replace={'Temperature': '[A-Za-z]', 'Windspeed': '[A-Za-z]'},
    value='',
    regex=True,
)

In [20]:
# Small example frame: one (score, student) record per row.
df = pd.DataFrame(
    [
        ('exceptional', 'rob'),
        ('average', 'maya'),
        ('good', 'parthiv'),
        ('poor', 'tom'),
        ('average', 'julian'),
        ('exceptional', 'erica'),
    ],
    columns=['score', 'student'],
)
df

Unnamed: 0,score,student
0,exceptional,rob
1,average,maya
2,good,parthiv
3,poor,tom
4,average,julian
5,exceptional,erica


If we want to replace string values with numbers:
<br>The first list holds the values to be replaced; the second list holds their replacements.

In [21]:
# Element-wise pairing: the i-th entry of to_replace becomes the i-th entry
# of value (poor -> 1, average -> 2, good -> 3, exceptional -> 4).
df.replace(
    to_replace=['poor', 'average', 'good', 'exceptional'],
    value=[1, 2, 3, 4],
)

Unnamed: 0,score,student
0,4,rob
1,2,maya
2,3,parthiv
3,1,tom
4,2,julian
5,4,erica


***
# End of these parts