# Make a DataFrame


In [4]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'one': [1, 2, 3], 'two': [4, 5, 6], 'three': [7, 8, 9]})
df


Unnamed: 0,one,two,three
0,1,4,7
1,2,5,8
2,3,6,9


In [5]:
df.head(2)

Unnamed: 0,one,two,three
0,1,4,7
1,2,5,8


In [10]:
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

pd.DataFrame(arr)

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [12]:
pd.DataFrame(np.random.rand(5,5), columns=list('ABCDE'))

Unnamed: 0,A,B,C,D,E
0,0.060503,0.669181,0.10771,0.759583,0.778674
1,0.20518,0.79412,0.646395,0.972543,0.034445
2,0.608898,0.281901,0.30954,0.559222,0.649854
3,0.554984,0.830048,0.556608,0.43787,0.094691
4,0.69784,0.35226,0.240691,0.275828,0.794267


# How to rename columns

In [13]:
df

Unnamed: 0,one,two,three
0,1,4,7
1,2,5,8
2,3,6,9


In [21]:
df.rename(columns={'one': 'first', 'tow': 'second', 'three': 'third'}, inplace=True)
df

Unnamed: 0,first,two,third
0,1,4,7
1,2,5,8
2,3,6,9


In [23]:
df.columns=['col1', 'col2', 'col3']
df

Unnamed: 0,col1,col2,col3
0,1,4,7
1,2,5,8
2,3,6,9


In [25]:
df.columns.str.replace('col1', 'first')
df

Unnamed: 0,col1,col2,col3
0,1,4,7
1,2,5,8
2,3,6,9


In [4]:
# Import seaborn
import seaborn as sns
df = sns.load_dataset('titanic')
df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
df.loc[::-1].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True


In [6]:
df.loc[::-1].reset_index(drop=True).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
1,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
2,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
3,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
4,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True


# Reverse Row Order

In [8]:
df.loc[:,::-1].head()

Unnamed: 0,alone,alive,embark_town,deck,adult_male,who,class,embarked,fare,parch,sibsp,age,sex,pclass,survived
0,False,no,Southampton,,True,man,Third,S,7.25,0,1,22.0,male,3,0
1,False,yes,Cherbourg,C,False,woman,First,C,71.2833,0,1,38.0,female,1,1
2,True,yes,Southampton,,False,woman,Third,S,7.925,0,0,26.0,female,3,1
3,False,yes,Southampton,C,False,woman,First,S,53.1,0,1,35.0,female,1,1
4,True,no,Southampton,,True,man,Third,S,8.05,0,0,35.0,male,3,0


# Select a column based on dtype

In [9]:
# only select those having numeric dtype
df.select_dtypes(include=['number']).head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [10]:
#exclude those having numeric dtype
df.select_dtypes(exclude=['number']).head()

Unnamed: 0,sex,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,male,S,Third,man,True,,Southampton,no,False
1,female,C,First,woman,False,C,Cherbourg,yes,False
2,female,S,Third,woman,False,,Southampton,yes,True
3,female,S,First,woman,False,C,Southampton,yes,False
4,male,S,Third,man,True,,Southampton,no,True


# Convert strings to numbers

In [13]:
import pandas as pd
df = pd.DataFrame([{'col_A': 1, 'col_B': 2}, {'col_A': 3, 'col_B': 4}])
df.dtypes

col_A    int64
col_B    int64
dtype: object

In [19]:
df.astype({'col_A': 'object', 'col_B': 'object'}).dtypes


col_A    object
col_B    object
dtype: object

# Reduce the size of dataframe

In [21]:
df = sns.load_dataset('titanic')
df.shape

(891, 15)

In [22]:
small_df = df.sample(frac=0.05)
small_df.shape

(45, 15)

In [25]:
small_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, 719 to 561
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     45 non-null     int64   
 1   pclass       45 non-null     int64   
 2   sex          45 non-null     object  
 3   age          36 non-null     float64 
 4   sibsp        45 non-null     int64   
 5   parch        45 non-null     int64   
 6   fare         45 non-null     float64 
 7   embarked     45 non-null     object  
 8   class        45 non-null     category
 9   who          45 non-null     object  
 10  adult_male   45 non-null     bool    
 11  deck         10 non-null     category
 12  embark_town  45 non-null     object  
 13  alive        45 non-null     object  
 14  alone        45 non-null     bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 4.9+ KB


# DA&ML Day-10

In [3]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [7]:
# load dataset
df = sns.load_dataset('titanic')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [16]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [15]:
df.isnull().sum() #counting the missing values
df.isnull().sum() / len(df) * 100 #percentage of missing values


survived        0.000000
pclass          0.000000
sex             0.000000
age            19.865320
sibsp           0.000000
parch           0.000000
fare            0.000000
embarked        0.224467
class           0.000000
who             0.000000
adult_male      0.000000
deck           77.216611
embark_town     0.224467
alive           0.000000
alone           0.000000
dtype: float64

In [21]:
df['sex'].unique() # Finding the unique values
df.sex.unique()

array(['male', 'female'], dtype=object)