In [1]:
import pandas as pd
import numpy as np

### Creating DataFrame from files

DataFrame can be created from three types of files:
<ol>
    <li>Comma Separated Values(CSV files) - read_csv function</li>
    <li> Excel Files - read_excel function</li>
    <li> Text files - read_table function</li>
</ol>

Separators maybe specified while using read_csv or read_table functions

In [2]:
students = pd.read_csv('StudentsPerformance.csv')

#### Display first five records

In [3]:
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
students.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [7]:
students.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

#### Display Last Eight Records

In [8]:
students.tail(8)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


### General attributes of dataframe

In [9]:
# returns number of data(cells) in the dataframe
students.size

8000

In [11]:
# returns the number of non-null values in each column
students.count()

gender                         1000
race/ethnicity                 1000
parental level of education    1000
lunch                          1000
test preparation course        1000
math score                     1000
reading score                  1000
writing score                  1000
dtype: int64

In [12]:
# Return a tuple of the number of records(rows) and number of attributes (columns)
students.shape

(1000, 8)

In [13]:
# Return number of dimensions of the dataframe
students.ndim

2

#### Summary about dataframe

In [14]:
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


#### Statistical Description about Dataframe
<ul>
    <li>By default it shows only for numerical attributes</li>
    <li>Keyword arguments shall be given to specifically display attributes</li>
    <li>For categorical attributes count,unique,top,freq are displayed</li>
</ul>
    


In [16]:
students.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [17]:
s = pd.Series([1, 2, 3, 1])

In [18]:
s

0    1
1    2
2    3
3    1
dtype: int64

In [19]:
s.describe()

count    4.000000
mean     1.750000
std      0.957427
min      1.000000
25%      1.000000
50%      1.500000
75%      2.250000
max      3.000000
dtype: float64

In [23]:
students.quantile(0.9)

math score       86.0
reading score    87.1
writing score    87.0
Name: 0.9, dtype: float64

In [25]:
students.describe(include=[object])

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
count,1000,1000,1000,1000,1000
unique,2,5,6,2,2
top,female,group C,some college,standard,none
freq,518,319,226,645,642


In [26]:
students.describe(exclude=[object])

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [24]:
students.describe(include='all')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
count,1000,1000,1000,1000,1000,1000.0,1000.0,1000.0
unique,2,5,6,2,2,,,
top,female,group C,some college,standard,none,,,
freq,518,319,226,645,642,,,
mean,,,,,,66.089,69.169,68.054
std,,,,,,15.16308,14.600192,15.195657
min,,,,,,0.0,17.0,10.0
25%,,,,,,57.0,59.0,57.75
50%,,,,,,66.0,70.0,69.0
75%,,,,,,77.0,79.0,79.0


### Number of bytes consumed

#### By DataFrame

In [27]:
students.memory_usage()

Index                           128
gender                         8000
race/ethnicity                 8000
parental level of education    8000
lunch                          8000
test preparation course        8000
math score                     8000
reading score                  8000
writing score                  8000
dtype: int64

#### By a Specific Columns

In [28]:
students['math score'].nbytes

8000

### Accessing all entries in a Column

In [29]:
#Square brackets
students['math score']

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [30]:
#Print all column names
students.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [32]:
#Dot operator
students.math score

SyntaxError: ignored

### Retrieving rows

#### By default numerical index given by Pandas

In [34]:
students.iloc[4]

gender                                 male
race/ethnicity                      group C
parental level of education    some college
lunch                              standard
test preparation course                none
math score                               76
reading score                            78
writing score                            75
Name: 4, dtype: object

#### Accessing a data in dataframe

#### Using row and column indices given by Pandas

In [36]:
students.iat[3,2]

"associate's degree"

#### Using indices specified by programmer

In [37]:
students.at[1,'parental level of education']

'some college'

###Find unique values in a column

#### Print number of unique values in each column of DataFrame

In [38]:
students.nunique()

gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

#### Unique values in a particular column

In [39]:
students['parental level of education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

#### Print count of unique values of a particular column

In [40]:
students['parental level of education'].value_counts()

some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64