In [1]:
import pandas as pd
import numpy as np

In [2]:
df_0=pd.read_csv('weather.csv')
df_0

Unnamed: 0,DATE,TMIN,TMAX
0,2021-04-25,18,28
1,2021-04-26,16,23
2,2021-04-27,17,24
3,2021-04-28,15,25
4,2021-04-29,17,28


In [None]:
df_0 = pd.read_csv('weather.csv',parse_dates=['DATE'])
df_0

In [3]:
df_0.rename(columns={
    'DATE': 'date',
    'TMIN': 'min_temp',
    'TMAX': 'max_temp',
},inplace=True)
df_0.sample(2)

Unnamed: 0,date,min_temp,max_temp
2,2021-04-27,17,24
4,2021-04-29,17,28


In [4]:
df_1 = pd.read_csv('donations.csv')
df_1

Unnamed: 0,1. First Name,2. Last Name,3. Donation Amount
0,Amy,Wang,200
1,Bender,Rodriguez,12
2,Philip,Fry,70


In [7]:
import re

In [8]:
def fix_col(col):
    """Fix column name
    >>> fix_col('1. First Name')
    'first_name'
    """
    return (
        re.sub(r'\d+\.\s+','', col)
        .lower()
        .replace(' ', '_')
    )

In [9]:
df_1.rename(columns=fix_col, inplace=True)
df_1

Unnamed: 0,first_name,last_name,donation_amount
0,Amy,Wang,200
1,Bender,Rodriguez,12
2,Philip,Fry,70


In [10]:
df_2 = pd.read_csv('points.csv')
df_2

Unnamed: 0,x,y,color,visible
0,1,1,0xFF0000,yes
1,2,2,0x00FF00,no
2,3,3,0x0000FF,yes


In [11]:
df_2.dtypes

x           int64
y           int64
color      object
visible    object
dtype: object

In [12]:
def asint(val):
    return int(val, base=0)

In [13]:
df_2['color'] = df_2['color'].apply(asint)
df_2.dtypes

x           int64
y           int64
color       int64
visible    object
dtype: object

In [14]:
bools = {
    'yes': True,
    'no': False,
}
df_2['visible'] = df_2['visible'].map(bools)
df_2.dtypes

x          int64
y          int64
color      int64
visible     bool
dtype: object

In [15]:
df_2

Unnamed: 0,x,y,color,visible
0,1,1,16711680,True
1,2,2,65280,False
2,3,3,255,True


In [16]:
csv_file = '2021-06.csv'
df_3 = pd.read_csv(csv_file)
df_3

Unnamed: 0,day,time,client
0,1,09:00-11:00,ecorp
1,1,12:00-18:00,allsafe
2,2,10:00-19:30,allsafe
3,3,11:30-17:00,ecorp


In [17]:
df_3['date'] = csv_file[:-len('.csv')]
df_3

Unnamed: 0,day,time,client,date
0,1,09:00-11:00,ecorp,2021-06
1,1,12:00-18:00,allsafe,2021-06
2,2,10:00-19:30,allsafe,2021-06
3,3,11:30-17:00,ecorp,2021-06


In [18]:
times = df_3['time'].str.split('-', expand=True)
times.columns = ['start', 'end']
times

Unnamed: 0,start,end
0,09:00,11:00
1,12:00,18:00
2,10:00,19:30
3,11:30,17:00


In [19]:
df_3 = pd.concat([df_3, times], axis=1)
df_3

Unnamed: 0,day,time,client,date,start,end
0,1,09:00-11:00,ecorp,2021-06,09:00,11:00
1,1,12:00-18:00,allsafe,2021-06,12:00,18:00
2,2,10:00-19:30,allsafe,2021-06,10:00,19:30
3,3,11:30-17:00,ecorp,2021-06,11:30,17:00


In [20]:
df_3['start'] = pd.to_datetime(
    df_3['date'].str.cat(df_3['start'], sep='T')
)
df_3['end'] = pd.to_datetime(
    df_3['date'].str.cat(df_3['end'], sep='T')
)
df_3

Unnamed: 0,day,time,client,date,start,end
0,1,09:00-11:00,ecorp,2021-06,2021-06-01 09:00:00,2021-06-01 11:00:00
1,1,12:00-18:00,allsafe,2021-06,2021-06-01 12:00:00,2021-06-01 18:00:00
2,2,10:00-19:30,allsafe,2021-06,2021-06-01 10:00:00,2021-06-01 19:30:00
3,3,11:30-17:00,ecorp,2021-06,2021-06-01 11:30:00,2021-06-01 17:00:00


In [21]:
(df_3['end']-df_3['start']).sum()

Timedelta('0 days 23:00:00')

In [22]:
df_4 = pd.read_csv('rides_1.csv')
df_4

Unnamed: 0,name,plate,distance
0,Gomez,1XZ2,3.7
1,Morticia,,2.1
2,Fester,,3.4
3,Lurch,Q38X3,-3.2
4,,03A,14.3
5,Wednesday,A,0.3
6,Pugsley,ZF003,153.14


In [23]:
mask = df_4.eval('name.isnull() | distance <= 0')
mask

0    False
1    False
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [24]:
df_4[~mask]

Unnamed: 0,name,plate,distance
0,Gomez,1XZ2,3.7
1,Morticia,,2.1
2,Fester,,3.4
5,Wednesday,A,0.3
6,Pugsley,ZF003,153.14


In [26]:
df_4 = df_4.reset_index()
df_4

Unnamed: 0,level_0,index,name,plate,distance
0,0,0,Gomez,1XZ2,3.7
1,1,1,Morticia,,2.1
2,2,2,Fester,,3.4
3,3,3,Lurch,Q38X3,-3.2
4,4,4,,03A,14.3
5,5,5,Wednesday,A,0.3
6,6,6,Pugsley,ZF003,153.14


In [27]:
df_5 = pd.read_csv('cart_2.csv', parse_dates=['date'])
df_5

Unnamed: 0,date,name,amount,price
0,2021-03-01,carrot,7.0,5.73
1,2021-03-01,egg,12.0,1.7
2,2021-03-01,milk,,3.57
3,2021-03-01,potato,2.0,
4,NaT,tomato,6.0,1.52
5,2021-03-02,potato,3.0,2.17
6,2021-03-03,,5.0,3.68


In [28]:
df_5['amount'].fillna(1, inplace=True)#inplace=True serves to modify the current data rather than creating new one.
df_5

Unnamed: 0,date,name,amount,price
0,2021-03-01,carrot,7.0,5.73
1,2021-03-01,egg,12.0,1.7
2,2021-03-01,milk,1.0,3.57
3,2021-03-01,potato,2.0,
4,NaT,tomato,6.0,1.52
5,2021-03-02,potato,3.0,2.17
6,2021-03-03,,5.0,3.68


In [29]:
most_common = df_5['name'].mode()[0]
df_5['name'].fillna(most_common, inplace=True)
df_5

Unnamed: 0,date,name,amount,price
0,2021-03-01,carrot,7.0,5.73
1,2021-03-01,egg,12.0,1.7
2,2021-03-01,milk,1.0,3.57
3,2021-03-01,potato,2.0,
4,NaT,tomato,6.0,1.52
5,2021-03-02,potato,3.0,2.17
6,2021-03-03,potato,5.0,3.68


In [30]:
df_5['date'].fillna(method='ffill', inplace=True)
df_5

Unnamed: 0,date,name,amount,price
0,2021-03-01,carrot,7.0,5.73
1,2021-03-01,egg,12.0,1.7
2,2021-03-01,milk,1.0,3.57
3,2021-03-01,potato,2.0,
4,2021-03-01,tomato,6.0,1.52
5,2021-03-02,potato,3.0,2.17
6,2021-03-03,potato,5.0,3.68


In [31]:
prices = df_5.groupby('name')['price'].transform(np.mean)
prices

0    5.730
1    1.700
2    3.570
3    2.925
4    1.520
5    2.925
6    2.925
Name: price, dtype: float64

In [32]:
df_5['price'].fillna(prices, inplace=True)
df_5

Unnamed: 0,date,name,amount,price
0,2021-03-01,carrot,7.0,5.73
1,2021-03-01,egg,12.0,1.7
2,2021-03-01,milk,1.0,3.57
3,2021-03-01,potato,2.0,2.925
4,2021-03-01,tomato,6.0,1.52
5,2021-03-02,potato,3.0,2.17
6,2021-03-03,potato,5.0,3.68


In [37]:
df_6 = pd.read_csv('metrics_2.csv', parse_dates=['time'])
df_6

Unnamed: 0,time,cpu,memory
0,2021-07-23 14:33:04,30.2,571.83
1,2021-07-23 14:44:05,32.9,524.72
2,2021-07-23 14:55:06,37.1,617.9


In [38]:
df_6 = pd.melt(
    df_6,
    value_vars=['cpu', 'memory'],
    id_vars=['time'],
    var_name='metric',
)
df_6

Unnamed: 0,time,metric,value
0,2021-07-23 14:33:04,cpu,30.2
1,2021-07-23 14:44:05,cpu,32.9
2,2021-07-23 14:55:06,cpu,37.1
3,2021-07-23 14:33:04,memory,571.83
4,2021-07-23 14:44:05,memory,524.72
5,2021-07-23 14:55:06,memory,617.9


In [35]:
df_6.rename(columns={
    'value': 'CPU&Mem',
},inplace=True)
df_6