In [50]:
# Emit a fixed greeting (presumably a quick check that the kernel is alive).
print("Hello world")

Hello world


In [51]:
import pandas as pd

# Two single-column frames keyed by day label. Their indexes overlap only
# partially: 'Day4' appears only in df_A and 'Day7' only in df_B, which gives
# the join examples something to show.
df_A = pd.DataFrame(
    data={'Temperature': [20, 30, 40, 50, 60]},
    index=['Day1', 'Day2', 'Day3', 'Day4', 'Day5'],
)

df_B = pd.DataFrame(
    data={'Humidity': [120, 130, 140, 150, 160]},
    index=['Day1', 'Day2', 'Day3', 'Day5', 'Day7'],
)

In [52]:
# Right join on the index: keep every row label of df_B and attach the
# matching Temperature from df_A; labels missing from df_A get NaN.
joined_df = df_A.merge(
    df_B,
    how="right",
    left_index=True,
    right_index=True,
)
print(joined_df)

      Temperature  Humidity
Day1         20.0       120
Day2         30.0       130
Day3         40.0       140
Day5         60.0       150
Day7          NaN       160


# Reshaping DataFrame

In [53]:
# Wide layout: one row per student, one mark column per subject.
df_wide = pd.DataFrame(
    data={
        'Name': ['Angel', 'Gehendra', 'Kaushal'],
        'Math': [75, 90, 80],
        'Science': [90, 95, 100],
        'English': [93, 92, 97],
    }
)

print(df_wide)

       Name  Math  Science  English
0     Angel    75       90       93
1  Gehendra    90       95       92
2   Kaushal    80      100       97


In [54]:
# Wide -> long: keep 'Name' as the identifier and fold every other column
# into (Subject, Marks) pairs, giving one row per student/subject combination.
df_long = df_wide.melt(
    id_vars=['Name'],
    var_name='Subject',
    value_name='Marks',
)

print(df_long)

       Name  Subject  Marks
0     Angel     Math     75
1  Gehendra     Math     90
2   Kaushal     Math     80
3     Angel  Science     90
4  Gehendra  Science     95
5   Kaushal  Science    100
6     Angel  English     93
7  Gehendra  English     92
8   Kaushal  English     97


## Pivot

In [55]:
# Long -> wide: one row per Name, one column per Subject, cells hold Marks.
# pivot() only reshapes (no aggregation), so it expects each (Name, Subject)
# pair to occur once.
df_pivot = pd.pivot(
    df_long,
    index='Name',
    columns=['Subject'],
    values="Marks",
)

df_pivot

Subject,English,Math,Science
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Angel,93,75,90
Gehendra,92,90,95
Kaushal,97,80,100


In [56]:
# stack() moves the column labels into an inner index level, so the result is
# a Series keyed by (row label, column name). Its dtype is object because the
# stacked columns mix strings (Name, Subject) with integers (Marks).
stacked_df = df_long.stack()
stacked_df

0  Name          Angel
   Subject        Math
   Marks            75
1  Name       Gehendra
   Subject        Math
   Marks            90
2  Name        Kaushal
   Subject        Math
   Marks            80
3  Name          Angel
   Subject     Science
   Marks            90
4  Name       Gehendra
   Subject     Science
   Marks            95
5  Name        Kaushal
   Subject     Science
   Marks           100
6  Name          Angel
   Subject     English
   Marks            93
7  Name       Gehendra
   Subject     English
   Marks            92
8  Name        Kaushal
   Subject     English
   Marks            97
dtype: object

In [57]:
# unstack() pivots the inner index level (the former column names) back into
# columns, recovering the Name / Subject / Marks layout of df_long.
# NOTE(review): the values come from an object-dtype Series, so Marks is
# likely object rather than int here — confirm with unstacked_df.dtypes.
unstacked_df = stacked_df.unstack()
unstacked_df

Unnamed: 0,Name,Subject,Marks
0,Angel,Math,75
1,Gehendra,Math,90
2,Kaushal,Math,80
3,Angel,Science,90
4,Gehendra,Science,95
5,Kaushal,Science,100
6,Angel,English,93
7,Gehendra,English,92
8,Kaushal,English,97


In [58]:
# Same reshape as pivot(), but pivot_table() combines duplicate
# (Name, Subject) pairs with aggfunc. Taking the mean is why the marks are
# displayed as floats.
pivot_table_df = df_long.pivot_table(
    index='Name',
    columns=['Subject'],
    values="Marks",
    aggfunc='mean',
)

pivot_table_df

Subject,English,Math,Science
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Angel,93.0,75.0,90.0
Gehendra,92.0,90.0,95.0
Kaushal,97.0,80.0,100.0


# Handling missing, categorical, and time-series data

In [59]:
# Load the sample table from a CSV given by a relative path, then summarise
# column dtypes and non-null counts to see where values are missing.
df = pd.read_csv("test.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Roll     5 non-null      int64  
 1   Name     4 non-null      object 
 2   Age      3 non-null      float64
 3   Address  5 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 292.0+ bytes


In [60]:
# Count the missing values in each column.
df.isna().sum()

Roll       0
Name       1
Age        2
Address    0
dtype: int64

In [61]:
# Drop every row that has at least one missing value (dropna's default axis).
# The result is a new frame; df itself is left unchanged.
df1 = df.dropna()
df1.head()

Unnamed: 0,Roll,Name,Age,Address
0,1,Ram,22.0,Dharan
3,4,Sita,32.0,Kathmandu
4,5,Gopal,42.0,Damak


In [62]:
# Drop every COLUMN that has at least one missing value (axis=1 selects
# columns, not rows). Only the fully populated columns remain.
df2 = df.dropna(axis=1)
df2.head()

Unnamed: 0,Roll,Address
0,1,Dharan
1,2,Biratnagar
2,3,Itahari
3,4,Kathmandu
4,5,Damak


In [65]:
# Mean imputation: replace the missing ages with the average of the ages
# that are present.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
df

Unnamed: 0,Roll,Name,Age,Address
0,1,Ram,22.0,Dharan
1,2,Shyam,32.0,Biratnagar
2,3,,32.0,Itahari
3,4,Sita,32.0,Kathmandu
4,5,Gopal,42.0,Damak


In [66]:
# Fill missing names with a placeholder label instead of dropping those rows.
df['Name'] = df['Name'].fillna("Unknown")
df.head()

Unnamed: 0,Roll,Name,Age,Address
0,1,Ram,22.0,Dharan
1,2,Shyam,32.0,Biratnagar
2,3,Unknown,32.0,Itahari
3,4,Sita,32.0,Kathmandu
4,5,Gopal,42.0,Damak


In [67]:
# Persist the cleaned table; index=False leaves the row index out of the file.
df.to_csv("final.csv", index=False)

## Categorical Data

In [81]:
# Small employee table for the categorical-data examples.
# Note: this rebinds `df`, replacing the CSV-derived frame used in the
# missing-data cells above.
df = pd.DataFrame(
    data={
        'EID': [101, 102, 103, 104],
        "Name": ["Ram", "Shyam", "Hari", "Sita"],
        "Level": ["Junior", "Senior", "Mid", "Junior"],
    }
)

df.dtypes

EID       int64
Name     object
Level    object
dtype: object

In [82]:
# Convert the repeated Level strings to pandas' category dtype, which the
# .cat accessor used in the following cells requires.
df['Level'] = df['Level'].astype("category")
df.dtypes

EID         int64
Name       object
Level    category
dtype: object

In [83]:
# Shorten two of the category labels; 'Mid' is not in the mapping and keeps
# its name.
level_abbreviations = {"Junior": "Jr", "Senior": "Sr"}
df['Level'] = df["Level"].cat.rename_categories(level_abbreviations)

df.head()

Unnamed: 0,EID,Name,Level
0,101,Ram,Jr
1,102,Shyam,Sr
2,103,Hari,Mid
3,104,Sita,Jr


In [84]:
# Integer code of each row's category. The codes follow the category order
# (Jr=0, Mid=1, Sr=2 in the printed output), which is alphabetical here and
# not a seniority ranking.
df['Level_code'] = df["Level"].cat.codes
print(df.head())

   EID   Name Level  Level_code
0  101    Ram    Jr           0
1  102  Shyam    Sr           2
2  103   Hari   Mid           1
3  104   Sita    Jr           0


In [85]:
# One-hot encode Level into Lev_<category> indicator columns and append them
# to the right of the existing columns.
# Note: re-running this cell appends the indicator columns a second time.
encoded = pd.get_dummies(data=df['Level'], prefix='Lev')
df = pd.concat([df, encoded], axis="columns")
print(df)

   EID   Name Level  Level_code  Lev_Jr  Lev_Mid  Lev_Sr
0  101    Ram    Jr           0    True    False   False
1  102  Shyam    Sr           2   False    False    True
2  103   Hari   Mid           1   False     True   False
3  104   Sita    Jr           0    True    False   False


In [86]:
# Read the invoice data from a Google Drive link, parsing InvoiceDate as
# datetimes and using it as the index.
# NOTE(review): the saved output below echoes the read_csv line, which looks
# like the tail of a Python warning whose message was lost in the export —
# re-run to see what pandas warned about.
# NOTE(review): this needs network access on every run; the frame is written
# to a local CSV further down, so consider reading that copy when it exists.
url = 'https://drive.google.com/uc?id=15zVC47Zy5IPINchVmXO0K8XPPYLp0Y_6'
df_time = pd.read_csv(url, parse_dates=['InvoiceDate'], index_col='InvoiceDate')


  df_time = pd.read_csv(url, parse_dates=['InvoiceDate'], index_col='InvoiceDate')


In [87]:
# Preview the first rows; InvoiceDate is the index rather than a regular column.
df_time.head()

Unnamed: 0_level_0,Invoice,StockCode,Description,Quantity,Price,Customer ID,Country
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-12-01 07:45:00,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,13085.0,United Kingdom
2009-12-01 07:45:00,489434,79323P,PINK CHERRY LIGHTS,12,6.75,13085.0,United Kingdom
2009-12-01 07:45:00,489434,79323W,WHITE CHERRY LIGHTS,12,6.75,13085.0,United Kingdom
2009-12-01 07:45:00,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,13085.0,United Kingdom
2009-12-01 07:45:00,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,13085.0,United Kingdom


In [89]:
# Save a local copy. No index=False here, so pandas' default writes the
# InvoiceDate index as the first column of the file.
df_time.to_csv("time_series_data_fuse.csv")