In [2]:
import pandas as pd
import numpy as np

In [50]:
data = {
"Name": ["Alice","Bob","Charlie","David","Eva"],
"Age": [25,30,35,28,40],
"City": ["NY","London","Paris","NY","London"],
"Salary": [50000,60000,70000,55000,80000],
"JoiningDate": ["2020-01-15","2019-07-22","2021-03-10","2020-12-05","2018-06-30"]
}
df = pd.DataFrame(data)
print(df)

      Name  Age    City  Salary JoiningDate
0    Alice   25      NY   50000  2020-01-15
1      Bob   30  London   60000  2019-07-22
2  Charlie   35   Paris   70000  2021-03-10
3    David   28      NY   55000  2020-12-05
4      Eva   40  London   80000  2018-06-30


In [51]:
print(df.head())

      Name  Age    City  Salary JoiningDate
0    Alice   25      NY   50000  2020-01-15
1      Bob   30  London   60000  2019-07-22
2  Charlie   35   Paris   70000  2021-03-10
3    David   28      NY   55000  2020-12-05
4      Eva   40  London   80000  2018-06-30


In [52]:
print(df.shape)

(5, 5)


In [53]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         5 non-null      object
 1   Age          5 non-null      int64 
 2   City         5 non-null      object
 3   Salary       5 non-null      int64 
 4   JoiningDate  5 non-null      object
dtypes: int64(2), object(3)
memory usage: 332.0+ bytes
None


In [54]:
print(df.describe())

            Age        Salary
count   5.00000      5.000000
mean   31.60000  63000.000000
std     5.94138  12041.594579
min    25.00000  50000.000000
25%    28.00000  55000.000000
50%    30.00000  60000.000000
75%    35.00000  70000.000000
max    40.00000  80000.000000


In [55]:
print(df.isnull().sum())

Name           0
Age            0
City           0
Salary         0
JoiningDate    0
dtype: int64


In [56]:
print(df["City"].unique())

['NY' 'London' 'Paris']


In [67]:
df["Experience"] = [5,7,3,4,10]
df["Salary"] = df["Salary"] * 1.1
df["Age"] = df["Age"].fillna(df["Age"].mean())
print(df)


         Age    City    Salary JoiningDate  Experience AgeGroup  \
Name                                                              
Eva       40  London  106480.0  2018-06-30           5   Senior   
Charlie   35   Paris   93170.0  2021-03-10           7    Adult   
Bob       30  London   79860.0  2019-07-22           3    Adult   
David     28      NY   73205.0  2020-12-05           4    Adult   
Alice     25      NY   66550.0  2020-01-15          10    Young   

        SalaryCategory  SalaryK  
Name                             
Eva               High    96.80  
Charlie           High    84.70  
Bob               High    72.60  
David             High    66.55  
Alice             High    60.50  


In [None]:

df.sort_values(by="Salary", ascending=False, inplace=True)
df.set_index("Name", inplace=True)
print(df)


Step 4: Sort & Index
         Age    City   Salary JoiningDate  Experience
Name                                                 
Eva       40  London  96800.0  2018-06-30          10
Charlie   35   Paris  84700.0  2021-03-10           3
Bob       30  London  72600.0  2019-07-22           7
David     28      NY  66550.0  2020-12-05           4
Alice     25      NY  60500.0  2020-01-15           5


In [60]:
df["AgeGroup"] = pd.cut(df["Age"], bins=[0,25,35,50], labels=["Young","Adult","Senior"])
df["SalaryCategory"] = df["Salary"].apply(lambda x: "High" if x>60000 else "Low")
df["SalaryK"] = df["Salary"].apply(lambda x: x/1000)
print(df)

         Age    City   Salary JoiningDate  Experience AgeGroup SalaryCategory  \
Name                                                                            
Eva       40  London  96800.0  2018-06-30          10   Senior           High   
Charlie   35   Paris  84700.0  2021-03-10           3    Adult           High   
Bob       30  London  72600.0  2019-07-22           7    Adult           High   
David     28      NY  66550.0  2020-12-05           4    Adult           High   
Alice     25      NY  60500.0  2020-01-15           5    Young           High   

         SalaryK  
Name              
Eva        96.80  
Charlie    84.70  
Bob        72.60  
David      66.55  
Alice      60.50  


In [68]:
pivot = df.pivot_table(
    values="Salary", 
    index="City", 
    columns="AgeGroup", 
    aggfunc="mean", 
    observed=False  # keep current behavior
)
print(pivot)


AgeGroup    Young    Adult    Senior
City                                
London        NaN  79860.0  106480.0
NY        66550.0  73205.0       NaN
Paris         NaN  93170.0       NaN


In [62]:
scores = pd.DataFrame({"Name":["Alice","Bob","Charlie"],"Score":[90,85,95]})
df_merged = pd.merge(df.reset_index(), scores, on="Name", how="left")


new_employees = pd.DataFrame({
"Name":["Frank","Grace"],
"Age":[29,33],
"City":["Paris","NY"],
"Salary":[60000,65000],
"JoiningDate":["2022-02-10","2021-05-12"],
"Experience":[2,5]
})
df_final = pd.concat([df_merged, new_employees], ignore_index=True)
print(df_final)

      Name  Age    City   Salary JoiningDate  Experience AgeGroup  \
0      Eva   40  London  96800.0  2018-06-30          10   Senior   
1  Charlie   35   Paris  84700.0  2021-03-10           3    Adult   
2      Bob   30  London  72600.0  2019-07-22           7    Adult   
3    David   28      NY  66550.0  2020-12-05           4    Adult   
4    Alice   25      NY  60500.0  2020-01-15           5    Young   
5    Frank   29   Paris  60000.0  2022-02-10           2      NaN   
6    Grace   33      NY  65000.0  2021-05-12           5      NaN   

  SalaryCategory  SalaryK  Score  
0           High    96.80    NaN  
1           High    84.70   95.0  
2           High    72.60   85.0  
3           High    66.55    NaN  
4           High    60.50   90.0  
5            NaN      NaN    NaN  
6            NaN      NaN    NaN  


In [69]:
if 'JoiningDate' not in df_final.columns:
    df_final.reset_index(inplace=True)
df_final['JoiningDate'] = pd.to_datetime(df_final['JoiningDate'])
df_final.set_index('JoiningDate', inplace=True)
monthly_salary = df_final['Salary'].resample('ME').sum()
print(monthly_salary)


JoiningDate
2018-06-30    96800.0
2018-07-31        0.0
2018-08-31        0.0
2018-09-30        0.0
2018-10-31        0.0
2018-11-30        0.0
2018-12-31        0.0
2019-01-31        0.0
2019-02-28        0.0
2019-03-31        0.0
2019-04-30        0.0
2019-05-31        0.0
2019-06-30        0.0
2019-07-31    72600.0
2019-08-31        0.0
2019-09-30        0.0
2019-10-31        0.0
2019-11-30        0.0
2019-12-31        0.0
2020-01-31    60500.0
2020-02-29        0.0
2020-03-31        0.0
2020-04-30        0.0
2020-05-31        0.0
2020-06-30        0.0
2020-07-31        0.0
2020-08-31        0.0
2020-09-30        0.0
2020-10-31        0.0
2020-11-30        0.0
2020-12-31    66550.0
2021-01-31        0.0
2021-02-28        0.0
2021-03-31    84700.0
2021-04-30        0.0
2021-05-31    65000.0
2021-06-30        0.0
2021-07-31        0.0
2021-08-31        0.0
2021-09-30        0.0
2021-10-31        0.0
2021-11-30        0.0
2021-12-31        0.0
2022-01-31        0.0
2022-02-28    60000.

In [64]:
df_encoded = pd.get_dummies(df_final, columns=["City"])
df_encoded["AgeGroupCode"] = df_encoded["AgeGroup"].astype("category").cat.codes
print(df_encoded)

                Name  Age   Salary  Experience AgeGroup SalaryCategory  \
JoiningDate                                                              
2018-06-30       Eva   40  96800.0          10   Senior           High   
2021-03-10   Charlie   35  84700.0           3    Adult           High   
2019-07-22       Bob   30  72600.0           7    Adult           High   
2020-12-05     David   28  66550.0           4    Adult           High   
2020-01-15     Alice   25  60500.0           5    Young           High   
2022-02-10     Frank   29  60000.0           2      NaN            NaN   
2021-05-12     Grace   33  65000.0           5      NaN            NaN   

             SalaryK  Score  City_London  City_NY  City_Paris  AgeGroupCode  
JoiningDate                                                                  
2018-06-30     96.80    NaN         True    False       False             2  
2021-03-10     84.70   95.0        False    False        True             1  
2019-07-22     72.60 

In [66]:
X = df_encoded[["Age","Salary","Experience","AgeGroupCode"]].values
y = df_encoded["Score"].fillna(0).values
numeric_df = df_encoded.select_dtypes(include=np.number)
print(numeric_df.corr())
print("\nX shape:", X.shape, "y shape:", y.shape)

                   Age    Salary  Experience   SalaryK  Score  AgeGroupCode
Age           1.000000  0.893894    0.543949  0.999389    0.5      0.491822
Salary        0.893894  1.000000    0.640182  1.000000    0.5      0.808603
Experience    0.543949  0.640182    1.000000  0.546177   -1.0      0.636396
SalaryK       0.999389  1.000000    0.546177  1.000000    0.5      0.880830
Score         0.500000  0.500000   -1.000000  0.500000    1.0      0.000000
AgeGroupCode  0.491822  0.808603    0.636396  0.880830    0.0      1.000000

X shape: (7, 4) y shape: (7,)
