In [1]:
import numpy as np
import pandas as pd
# Ten example pets keyed by a single-letter label; 'age' is missing (NaN) for two of them.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}

# Row labels 'a' through 'j', one per record.
labels = list('abcdefghij')

df = pd.DataFrame(data, index=labels)


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [4]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   animal    10 non-null     object 
 1   age       8 non-null      float64
 2   visits    10 non-null     int64  
 3   priority  10 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 700.0+ bytes


In [6]:
# Show the first three rows.
df.head(3)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [8]:
# Keep every row but only the 'animal' and 'age' columns.
df.loc[:, ['animal', 'age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [None]:
# Pick rows 'd', 'e' and 'i' and the 'animal' and 'age' columns by label.
selected_data = df.loc[['d', 'e', 'i'], ['animal', 'age']]
# Leave the frame as the cell's last expression so the notebook renders it
# as a table rather than as plain printed text.
selected_data

In [9]:
# Rows whose 'visits' count is strictly greater than 3
more_than_three_visits = df['visits'].gt(3)
df.loc[more_than_three_visits]



Unnamed: 0,animal,age,visits,priority


In [10]:
# Rows whose 'age' value is missing (NaN)
age_is_missing = df['age'].isnull()
df.loc[age_is_missing]



Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


In [11]:
# Cats younger than 3; rows with a missing age never satisfy the age test
is_cat = df['animal'].eq('cat')
is_under_three = df['age'].lt(3)
df.loc[is_cat & is_under_three]



Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
f,cat,2.0,3,no


In [12]:
# Rows with 2 <= age <= 4; `between` includes both endpoints by default
df[df['age'].between(2, 4)]


Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
f,cat,2.0,3,no
j,dog,3.0,1,no


In [13]:
# Overwrite the single cell at row 'f', column 'age' with 1.5
df.at['f', 'age'] = 1.5

In [16]:
# Total number of visits across every row of df
total_visits = df['visits'].sum()
total_visits



19

In [17]:
# Average age per animal type; missing ages are skipped by the mean
mean_age_by_animal = df.groupby('animal')['age'].mean()
mean_age_by_animal


animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64

In [18]:
# Append a new row under the label 'k' (values follow the column order:
# animal, age, visits, priority) ...
df.loc['k'] = ['python', 2.5, 1, 'yes']

# ... then remove that row again by its label.
df = df.drop(index='k')

In [19]:
# Number of rows for each distinct value in the 'animal' column.
df['animal'].value_counts()

cat      4
dog      4
snake    2
Name: animal, dtype: int64

In [20]:
# Sort by age (descending), breaking ties by visits (ascending).
# Returns a new frame; df itself is not reordered.
df.sort_values(by=['age', 'visits'], ascending=[False, True])

Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no
b,cat,3.0,3,yes
a,cat,2.5,1,yes
f,cat,1.5,3,no
c,snake,0.5,2,no
h,cat,,1,yes
d,dog,,3,yes


In [22]:
# Replace 'yes' with True and 'no' with False in the 'priority' column.
# `map` is used instead of `replace`: swapping strings for booleans via
# `replace` relies on implicit object->bool downcasting, which newer pandas
# deprecates. `map` yields a boolean column directly.
# NOTE: this expects the original 'yes'/'no' strings; any other value
# (including an already-converted True/False) would map to NaN.
df['priority'] = df['priority'].map({'yes': True, 'no': False})
df


Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,snake,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,1.5,3,False
g,snake,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [23]:
# Rename every 'snake' in the 'animal' column to 'python'; other values are untouched
df['animal'] = df['animal'].replace({'snake': 'python'})
df


Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,python,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,1.5,3,False
g,python,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [24]:
# Mean age for every (animal, visits) combination: one row per animal,
# one column per visit count; combinations with no age data show NaN.
mean_age_pivot = pd.pivot_table(
    df,
    index='animal',
    columns='visits',
    values='age',
    aggfunc='mean',
)
mean_age_pivot



visits    1    2     3
animal                
cat     2.5  NaN  2.25
dog     3.0  6.0   NaN
python  4.5  0.5   NaN


In [26]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})

# Keep a row only when its 'A' value differs from the row directly above it.
# The first row is always kept because its shifted neighbour is NaN.
differs_from_previous = df['A'].ne(df['A'].shift())
df.loc[differs_from_previous]

Unnamed: 0,A
0,1
1,2
3,3
4,4
5,5
8,6
9,7


In [29]:
# 5x3 frame of uniform random values. The generator is not seeded here,
# so the numbers differ on every run.
df = pd.DataFrame(np.random.random(size=(5, 3)))

# Subtract each row's mean from every element of that row. The vectorised
# `sub` aligns the per-row means on the index (axis=0), replacing a
# row-by-row `apply` with a Python lambda.
row_centered = df.sub(df.mean(axis=1), axis=0)
row_centered


Unnamed: 0,0,1,2
0,-0.318928,-0.019286,0.338214
1,-0.025202,-0.060136,0.085337
2,0.011823,-0.193325,0.181502
3,-0.067028,0.136622,-0.069594
4,-0.190362,0.410438,-0.220076


In [30]:
# 5x10 frame of uniform random values with columns labelled 'a'..'j' (unseeded).
df = pd.DataFrame(np.random.random(size=(5, 10)), columns=list('abcdefghij'))

# Label of the column whose values add up to the smallest total.
column_totals = df.sum()
column_totals.idxmin()

'b'

In [31]:
# 10x3 frame of random integers drawn from {0, 1} (unseeded).
df = pd.DataFrame(np.random.randint(0, 2, size=(10, 3)))

# Number of distinct rows: drop repeated rows, then count what is left.
unique_rows_count = len(df.drop_duplicates())

In [36]:
nan = np.nan

data = [[0.04,  nan,  nan, 0.25,  nan, 0.43, 0.71, 0.51,  nan,  nan],
        [ nan,  nan,  nan, 0.04, 0.76,  nan,  nan, 0.67, 0.76, 0.16],
        [ nan,  nan, 0.5 ,  nan, 0.31, 0.4 ,  nan,  nan, 0.24, 0.01],
        [0.49,  nan,  nan, 0.62, 0.73, 0.26, 0.85,  nan,  nan,  nan],
        [ nan,  nan, 0.41,  nan, 0.05,  nan, 0.61,  nan, 0.48, 0.68]]

columns = list('abcdefghij')

# Build the frame from the data above. Without this line the cell would
# operate on whatever `df` a previous cell happened to leave in the kernel.
df = pd.DataFrame(data, columns=columns)


def find_third_nan_index(row):
    """Return the 0-based position of the third missing value in ``row``.

    Parameters
    ----------
    row : iterable
        Values scanned in order, e.g. one DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    int or None
        Position (not label) of the third value for which ``pd.isna`` is
        true, or ``None`` when ``row`` has fewer than three missing values.
    """
    nan_count = 0
    for idx, value in enumerate(row):
        if pd.isna(value):
            nan_count += 1
            if nan_count == 3:
                return idx
    # Fewer than three missing values in this row.
    return None


# One result per row, indexed like df.
third_nan_columns = df.apply(find_third_nan_index, axis=1)
third_nan_columns

0    4
1    2
2    3
3    7
4    3
dtype: int64

In [39]:
df = pd.DataFrame({
    'grps': list('aaabbcaabcccbbc'),
    'vals': [12, 345, 3, 1, 45, 14, 4, 52, 54, 23, 235, 21, 57, 3, 87],
})

# Total of 'vals' within each 'grps' label.
df.groupby('grps')['vals'].sum()

grps
a    416
b    160
c    380
Name: vals, dtype: int64