In [2]:
import pandas as pd
df = pd.read_csv('datasets/chap3/nyc_temperatures.csv')
df.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-10-01T00:00:00,TAVG,GHCND:USW00014732,"H,,S,",21.2
1,2018-10-01T00:00:00,TMAX,GHCND:USW00014732,",,W,2400",25.6
2,2018-10-01T00:00:00,TMIN,GHCND:USW00014732,",,W,2400",18.3
3,2018-10-02T00:00:00,TAVG,GHCND:USW00014732,"H,,S,",22.7
4,2018-10-02T00:00:00,TMAX,GHCND:USW00014732,",,W,2400",26.1


In [3]:
# Renaming Columns
df.columns
df.rename(columns={"value":"temp_c" , "attributes":"flags"})
df.columns

Index(['date', 'datatype', 'station', 'attributes', 'value'], dtype='object')

In [5]:
# using rename
df.rename(str.capitalize, axis='columns', inplace=True)

In [7]:
# Changing data types in colums
df.columns
df.dtypes
df.loc[:,'Date'] = pd.to_datetime(df['Date'])


  df.loc[:,'Date'] = pd.to_datetime(df['Date'])


In [8]:
# this one is treated as string 
df.Date.describe()
# if we want to treated as numeric and get results in numeric
df.Date.describe(datetime_is_numeric=True)

  df.Date.describe()


count                     93
mean     2018-10-16 00:00:00
min      2018-10-01 00:00:00
25%      2018-10-08 00:00:00
50%      2018-10-16 00:00:00
75%      2018-10-24 00:00:00
max      2018-10-31 00:00:00
Name: Date, dtype: object

In [21]:
# with rename we can handle any type converstion by passing the column as named parameters and their new values 
# as the value of the argument to the method call
df = pd.read_csv('datasets/chap3/nyc_temperatures.csv').rename(columns={'value': 'temp_C', 'attributes': 'flags'})

new_df = df.assign(
    date = pd.to_datetime(df['date']),
    temp_f = (df['temp_C']* 9/5) + 32
)
new_df.head(2)

Unnamed: 0,date,datatype,station,flags,temp_C,temp_f
0,2018-10-01,TAVG,GHCND:USW00014732,"H,,S,",21.2,70.16
1,2018-10-01,TMAX,GHCND:USW00014732,",,W,2400",25.6,78.08


In [10]:
df = df.assign(
    # assigns each value of the column to the date
date=lambda x: pd.to_datetime(x.date),
temp_C_whole=lambda x: x.temp_C.astype('int'),
temp_F=lambda x: (x.temp_C * 9/5) + 32,
temp_F_whole=lambda x: x.temp_F.astype('int'))

In [11]:
# using the category type
df_with_categories =   df.assign(
    station = df.station.astype("category"), 
    datatype = df.datatype.astype("category")
)
df_with_categories
df_with_categories.dtypes

date            datetime64[ns]
datatype              category
station               category
flags                   object
temp_C                 float64
temp_C_whole             int64
temp_F                 float64
temp_F_whole             int64
dtype: object

In [12]:
# Describing Categorical Data
df_with_categories.describe(include="category")

Unnamed: 0,datatype,station
count,93,93
unique,3,1
top,TAVG,GHCND:USW00014732
freq,31,93


In [20]:
# sorting Values
df[df.datatype == "TMAX"].sort_values(by='temp_C', ascending=False).head(2)

Unnamed: 0,date,datatype,station,flags,temp_C,temp_C_whole,temp_F,temp_F_whole
19,2018-10-07,TMAX,GHCND:USW00014732,",,W,2400",27.8,27,82.04,82
28,2018-10-10,TMAX,GHCND:USW00014732,",,W,2400",27.8,27,82.04,82


In [19]:
# sorting with multuple columns and
df[df.datatype == 'TMAX'].sort_values( by=['temp_C', 'date'], ascending=[False, True]).head(2)


Unnamed: 0,date,datatype,station,flags,temp_C,temp_C_whole,temp_F,temp_F_whole
19,2018-10-07,TMAX,GHCND:USW00014732,",,W,2400",27.8,27,82.04,82
28,2018-10-10,TMAX,GHCND:USW00014732,",,W,2400",27.8,27,82.04,82


In [18]:
# We can also sort by indexes or by columns
# Automatically index is 0 which measn that it is for the rows but 1 
# its for columns
df.sort_index(axis=1).head(2)

Unnamed: 0,datatype,date,flags,station,temp_C,temp_C_whole,temp_F,temp_F_whole
0,TAVG,2018-10-01,"H,,S,",GHCND:USW00014732,21.2,21,70.16,70
1,TMAX,2018-10-01,",,W,2400",GHCND:USW00014732,25.6,25,78.08,78


In [17]:
# setting the index
df.set_index('date').head(2)

Unnamed: 0_level_0,datatype,station,flags,temp_C,temp_C_whole,temp_F,temp_F_whole
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-10-01,TAVG,GHCND:USW00014732,"H,,S,",21.2,21,70.16,70
2018-10-01,TMAX,GHCND:USW00014732,",,W,2400",25.6,25,78.08,78


In [26]:
# Reindex method
sp = pd.read_csv('datasets/chap3/sp500.csv', index_col="date", parse_dates=True).drop(columns=["adj_close"])
sp

Unnamed: 0_level_0,high,low,open,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-03,2263.879883,2245.129883,2251.570068,2257.830078,3770530000
2017-01-04,2272.820068,2261.600098,2261.600098,2270.750000,3764890000
2017-01-05,2271.500000,2260.449951,2268.179932,2269.000000,3761820000
2017-01-06,2282.100098,2264.060059,2271.139893,2276.979980,3339890000
2017-01-09,2275.489990,2268.899902,2273.590088,2268.899902,3217610000
...,...,...,...,...,...
2018-12-24,2410.340088,2351.100098,2400.560059,2351.100098,2613930000
2018-12-26,2467.760010,2346.580078,2363.120117,2467.699951,4233990000
2018-12-27,2489.100098,2397.939941,2442.500000,2488.830078,4096610000
2018-12-28,2520.270020,2472.889893,2498.770020,2485.739990,3702620000
