[Python tutorial Pandas by Codebasics](https://www.youtube.com/watch?v=CmorAWRsCAw&list=PLeo1K3hjS3uuASpe-1LjfG5f14Bnozjwy)

In [1]:
import pandas as pd
df = pd.read_csv('nyc_weather.csv')
df.head(10)

Unnamed: 0,EST,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,1/1/2016,38,23,52,30.03,10,8.0,0,5,,281
1,1/2/2016,36,18,46,30.02,10,7.0,0,3,,275
2,1/3/2016,40,21,47,29.86,10,8.0,0,1,,277
3,1/4/2016,25,9,44,30.05,10,9.0,0,3,,345
4,1/5/2016,20,-3,41,30.57,10,5.0,0,0,,333
5,1/6/2016,33,4,35,30.5,10,4.0,0,0,,259
6,1/7/2016,39,11,33,30.28,10,2.0,0,3,,293
7,1/8/2016,39,29,64,30.2,10,4.0,0,8,,79
8,1/9/2016,44,38,77,30.16,9,8.0,T,8,Rain,76
9,1/10/2016,50,46,71,29.59,4,,1.8,7,Rain,109


## Questions

__1. What was the maximum temperature in New York in the month of January?__

In [2]:
df['Temperature'].max()

50

 __2. On which days did it rain?__
 
We need two things: first, a condition that picks the rows where the event is 'Rain', and second, the date column (`EST`) of those rows.

In [3]:
# Dates ('EST') of the rows whose 'Events' value is exactly 'Rain'.
# A single .loc[row_mask, column] lookup is used instead of chained
# indexing (df[col][mask]); it selects rows and column in one step.
df.loc[df['Events'] == 'Rain', 'EST']

8      1/9/2016
9     1/10/2016
15    1/16/2016
26    1/27/2016
Name: EST, dtype: object

__3. What is the average wind speed?__

__NaN -__ Missing/blank data points

__Data Munging/ Data Wrangling -__ Process of data cleaning

In [4]:
# mean() skips NaN entries by default, so days with a missing wind speed
# are simply left out of the average.
df['WindSpeedMPH'].mean()

6.892857142857143

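These missing/NaN points can be filled using __fillna()__. Use the keyword __inplace=True__ to modify the dataframe itself instead of getting back a filled copy. Note that filling with 0 counts every missing reading as a wind speed of zero, so the mean comes out lower than the one computed above.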

In [5]:
# Replace missing wind-speed readings with 0, then recompute the mean.
# The dict form of fillna restricts the fill to 'WindSpeedMPH'; a bare
# df.fillna(0) would also write the number 0 into every other column that
# has gaps, including the text column 'Events'.
df.fillna({'WindSpeedMPH': 0}, inplace=True)
df['WindSpeedMPH'].mean()

6.225806451612903

# DataFrame 
It is the main object of pandas. It is used to represent tabular data (with rows and columns).

In [2]:
weather_df = pd.read_csv('weather_data.csv')
weather_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [3]:
# Dataframe can also be created from a dictionary
weather_data2 = {
    'day': ['1/1/2017','1/2/2017','1/3/2017','1/4/2017','1/5/2017','1/6/2017'],
    'temperature': [32,35,28,24,32,31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow','Snow','Rain', 'Sunny']
}
weather_data2

{'day': ['1/1/2017',
  '1/2/2017',
  '1/3/2017',
  '1/4/2017',
  '1/5/2017',
  '1/6/2017'],
 'event': ['Rain', 'Sunny', 'Snow', 'Snow', 'Rain', 'Sunny'],
 'temperature': [32, 35, 28, 24, 32, 31],
 'windspeed': [6, 7, 2, 7, 4, 2]}

In [4]:
weather_df = pd.DataFrame(weather_data2)
weather_df

Unnamed: 0,day,event,temperature,windspeed
0,1/1/2017,Rain,32,6
1,1/2/2017,Sunny,35,7
2,1/3/2017,Snow,28,2
3,1/4/2017,Snow,24,7
4,1/5/2017,Rain,32,4
5,1/6/2017,Sunny,31,2


In [5]:
# Shape/ dimensions of dataframe
weather_df.shape

(6, 4)

In [6]:
# Unpack the shape tuple into the row count and the column count.
# Note: shape is an attribute, not a method. weather_df.shape already is the
# tuple, so writing weather_df.shape() tries to call that tuple and raises
# "TypeError: 'tuple' object is not callable".
(rows, columns) = weather_df.shape
columns

4

In [7]:
# To print selected rows, slice by position: this returns rows 2, 3 and 4
# (the end of the slice, 5, is not included).
weather_df[2:5]

Unnamed: 0,day,event,temperature,windspeed
2,1/3/2017,Snow,28,2
3,1/4/2017,Snow,24,7
4,1/5/2017,Rain,32,4


In [8]:
# To print column names
weather_df.columns

Index(['day', 'event', 'temperature', 'windspeed'], dtype='object')

In [13]:
# To print individual columns
weather_df.day

0    1/1/2017
1    1/2/2017
2    1/3/2017
3    1/4/2017
4    1/5/2017
5    1/6/2017
Name: day, dtype: object

__OR, alternatively__

In [14]:
weather_df['day']   # same as weather_df.day

0    1/1/2017
1    1/2/2017
2    1/3/2017
3    1/4/2017
4    1/5/2017
5    1/6/2017
Name: day, dtype: object

In [15]:
type(weather_df['day'])  # Columns in a dataframe are of type Series

pandas.core.series.Series

In [16]:
# To print just a few columns, pass a list of column names (hence the double brackets).
# This helps when the analysis needs only a subset of the columns.
weather_df[['day','event','temperature']]

Unnamed: 0,day,event,temperature
0,1/1/2017,Rain,32
1,1/2/2017,Sunny,35
2,1/3/2017,Snow,28
3,1/4/2017,Snow,24
4,1/5/2017,Rain,32
5,1/6/2017,Sunny,31


In [17]:
# df.describe() will print statistics on the numerical columns
weather_df.describe()

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


In [18]:
# Conditions
weather_df[weather_df.temperature > 30]

Unnamed: 0,day,event,temperature,windspeed
0,1/1/2017,Rain,32,6
1,1/2/2017,Sunny,35,7
4,1/5/2017,Rain,32,4
5,1/6/2017,Sunny,31,2


In [19]:
# Two conditions combined with & (element-wise AND).
# Each comparison needs its own parentheses because & binds more tightly
# than > and >=.
weather_df[(weather_df.temperature > 30) & (weather_df.windspeed>=4)]

Unnamed: 0,day,event,temperature,windspeed
0,1/1/2017,Rain,32,6
1,1/2/2017,Sunny,35,7
4,1/5/2017,Rain,32,4


In [20]:
# Important pattern: keep the row(s) whose temperature equals the column maximum
weather_df[weather_df.temperature == weather_df.temperature.max()]

Unnamed: 0,day,event,temperature,windspeed
1,1/2/2017,Sunny,35,7


In [21]:
# Print only the 'day' column for the row(s) where the temperature is at its maximum.
# .loc[row_mask, column] selects rows and column in one step instead of
# chaining two [] lookups.
weather_df.loc[weather_df.temperature == weather_df.temperature.max(), 'day']

1    1/2/2017
Name: day, dtype: object

In [22]:
# To view 2 columns for the row(s) where the temperature is at its maximum.
# The column names must be passed as a list, which is why they sit in their
# own square brackets. .loc[row_mask, columns] selects rows and columns in
# one step instead of chaining two [] lookups.
weather_df.loc[weather_df.temperature == weather_df.temperature.max(), ['day', 'temperature']]

Unnamed: 0,day,temperature
1,1/2/2017,35


## set_index()

In [23]:
weather_df

Unnamed: 0,day,event,temperature,windspeed
0,1/1/2017,Rain,32,6
1,1/2/2017,Sunny,35,7
2,1/3/2017,Snow,28,2
3,1/4/2017,Snow,24,7
4,1/5/2017,Rain,32,4
5,1/6/2017,Sunny,31,2


In [24]:
# To see the index of the dataframe
weather_df.index

RangeIndex(start=0, stop=6, step=1)

In [25]:
# Change the index of the dataframe to the 'day' column.
# set_index() returns a new dataframe; it does not modify the original one.
weather_df.set_index('day')

Unnamed: 0_level_0,event,temperature,windspeed
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,Rain,32,6
1/2/2017,Sunny,35,7
1/3/2017,Snow,28,2
1/4/2017,Snow,24,7
1/5/2017,Rain,32,4
1/6/2017,Sunny,31,2


In [26]:
weather_df

Unnamed: 0,day,event,temperature,windspeed
0,1/1/2017,Rain,32,6
1,1/2/2017,Sunny,35,7
2,1/3/2017,Snow,28,2
3,1/4/2017,Snow,24,7
4,1/5/2017,Rain,32,4
5,1/6/2017,Sunny,31,2


So when we print __weather_df__ again, we still have the numerical index from 0 to 5.
That is, no change has occurred, because the original dataframe has not been modified.

To modify the original weather_df, use the parameter __inplace = True__.
The date can then be used as the index.

In [27]:
# Make 'day' the index of weather_df itself (inplace=True modifies the frame).
# The guard keeps the cell re-runnable: once 'day' is the index it is no
# longer a column, so a second set_index('day') would raise KeyError.
if weather_df.index.name != 'day':
    weather_df.set_index('day', inplace = True)
weather_df

Unnamed: 0_level_0,event,temperature,windspeed
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,Rain,32,6
1/2/2017,Sunny,35,7
1/3/2017,Snow,28,2
1/4/2017,Snow,24,7
1/5/2017,Rain,32,4
1/6/2017,Sunny,31,2


In [28]:
weather_df.loc['1/5/2017']

event          Rain
temperature      32
windspeed         4
Name: 1/5/2017, dtype: object

To reset the index back to the original numerical one, use __df.reset_index(inplace=True)__

In [29]:
# Move the current named index back into a regular column and restore the
# default 0..n-1 index.
# The guard keeps the cell re-runnable: on a frame that already has the
# default (unnamed) index, reset_index() would add a spurious 'index' column.
if weather_df.index.name is not None:
    weather_df.reset_index(inplace=True)
weather_df

Unnamed: 0,day,event,temperature,windspeed
0,1/1/2017,Rain,32,6
1,1/2/2017,Sunny,35,7
2,1/3/2017,Snow,28,2
3,1/4/2017,Snow,24,7
4,1/5/2017,Rain,32,4
5,1/6/2017,Sunny,31,2


In [30]:
# Make 'event' the index of weather_df itself.
# The guard keeps the cell re-runnable: once 'event' is the index it is no
# longer a column, so a second set_index('event') would raise KeyError.
if weather_df.index.name != 'event':
    weather_df.set_index('event', inplace = True)
weather_df

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Rain,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [31]:
# Observe that there are 2 rows with index 'Snow': index labels need not be
# unique, and .loc returns every row that carries the label.
weather_df.loc['Snow']

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
