In [None]:
# Source: https://github.com/Devinterview-io/pandas-interview-questions

In [1]:
import pandas as pd

# Build a small employee table: each key becomes a column, each list its values
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Diana'],
    Age=[25, 30, 35, 40],
    Department=['HR', 'Finance', 'IT', 'Marketing'],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department
0,Alice,25,HR
1,Bob,30,Finance
2,Charlie,35,IT
3,Diana,40,Marketing


In [3]:
#df.describe()

In [5]:
# Filter: keep only the rows whose Department is HR or IT
df.loc[df['Department'].isin(['HR', 'IT'])]

Unnamed: 0,Name,Age,Department
0,Alice,25,HR
2,Charlie,35,IT


In [7]:
# Sort: order the rows by Age, largest value first
df.sort_values('Age', ascending=False)

Unnamed: 0,Name,Age,Department
3,Diana,40,Marketing
2,Charlie,35,IT
1,Bob,30,Finance
0,Alice,25,HR


In [8]:
# Handle missing data

In [11]:
# Simulate a missing age for 'Charlie' (row label 2).
# 'Age' starts out as an integer column, which cannot hold a missing value.
# Cast it to float explicitly instead of relying on pandas to upcast the
# column silently during the assignment: newer pandas versions warn about
# that implicit upcast and are slated to reject it.
df['Age'] = df['Age'].astype('float64')
df.at[2, 'Age'] = float('nan')
df

Unnamed: 0,Name,Age,Department
0,Alice,25.0,HR
1,Bob,30.0,Finance
2,Charlie,,IT
3,Diana,40.0,Marketing


In [12]:
# Drop every row that contains at least one missing value.
# Rebind `df` to the returned frame rather than passing inplace=True:
# the result is the same, and the assignment makes the state change explicit.
df = df.dropna()

In [13]:
# Display the frame again to inspect it after the rows with missing data were dropped
df

Unnamed: 0,Name,Age,Department
0,Alice,25.0,HR
1,Bob,30.0,Finance
3,Diana,40.0,Marketing


In [14]:
# Group and aggregate

In [15]:
# Average Age within each Department (one row per department in the result)
df.groupby('Department')['Age'].agg('mean')

Department
Finance      30.0
HR           25.0
Marketing    40.0
Name: Age, dtype: float64

In [16]:
# Small frame with two numeric columns (A, B) and one string column (C)
data = dict(
    A=[1, 2, 3, 4, 5],
    B=[10, 20, 30, 40, 50],
    C=['foo', 'bar', 'baz', 'qux', 'quux'],
)
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,A,B,C
0,1,10,foo
1,2,20,bar
2,3,30,baz
3,4,40,qux
4,5,50,quux


In [17]:
# Positional slicing with .iloc: rows at positions 1 to 3 and the first two
# columns (the end of each slice is exclusive)
df.iloc[1:4, :2]

Unnamed: 0,A,B
1,2,20
2,3,30
3,4,40


In [18]:
# Boolean filtering: keep only the rows where column A is greater than 2
df.loc[df['A'].gt(2)]

Unnamed: 0,A,B,C
2,3,30,baz
3,4,40,qux
4,5,50,quux


In [20]:
# One row per person: a name and the score that person obtained
data = {
    'names': ['Alice', 'Bob', 'Charlie'],
    'scores': [80, 90, 85],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,names,scores
0,Alice,80
1,Bob,90
2,Charlie,85


In [21]:
# Double every score: .apply() calls the lambda once per element of the column
df['scores'].apply(lambda score: score * 2)

0    160
1    180
2    170
Name: scores, dtype: int64

In [23]:
# Frame in which every row appears exactly twice, used to demonstrate duplicate handling
data = {
    'A': [1, 1, 2, 2, 3, 3],
    'B': ['a', 'a', 'b', 'b', 'c', 'c'],
}
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,A,B
0,1,a
1,1,a
2,2,b
3,2,b
4,3,c
5,3,c


In [24]:
# Remove fully duplicated rows, keeping the first occurrence of each
df.drop_duplicates(keep='first')

Unnamed: 0,A,B
0,1,a
2,2,b
4,3,c


In [25]:
# Same de-duplication, but retain the last occurrence of each repeated row
# (compare the surviving index labels with the keep-first result)
df.drop_duplicates(
    subset=None,  # compare all columns
    keep='last',
)


Unnamed: 0,A,B
1,1,a
3,2,b
5,3,c


In [26]:
# Aggregate the duplicated keys: average the numeric columns within each group of 'A'.
# numeric_only=True skips the string column 'B', for which a mean is undefined;
# without it pandas >= 2.0 raises a TypeError (older versions emitted a warning).
# 'A' becomes the index and no numeric column remains, so the result has no columns.
df.groupby('A').mean(numeric_only=True)

  df.groupby('A').mean()


1
2
3


In [27]:
# Count duplicates
# Number of rows that repeat an earlier row (first occurrences are not counted)
df.duplicated(keep='first').sum()

3

In [28]:
# Long-format data: one row per (Date, Category) pair with its Value
data = dict(
    Date=['2020-01-01', '2020-01-01', '2020-01-02', '2020-01-02'],
    Category=['A', 'B', 'A', 'B'],
    Value=[10, 20, 30, 40],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Date,Category,Value
0,2020-01-01,A,10
1,2020-01-01,B,20
2,2020-01-02,A,30
3,2020-01-02,B,40


In [29]:
# Reshape from long to wide: one row per Date, one column per Category,
# with each cell holding the matching Value
df.pivot(
    columns='Category',
    index='Date',
    values='Value',
)


Category,A,B
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01,10,20
2020-01-02,30,40


In [30]:
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
}
df = pd.DataFrame.from_dict(data)

# .where() keeps each value for which the condition holds (value > 2) and
# substitutes the corresponding entry of `other` (the value times 10) elsewhere
df.where(df.gt(2), other=df.mul(10))

Unnamed: 0,A,B
0,10,10
1,20,20
2,3,30
3,4,40
4,5,50
