In [1]:
import pandas as pd
# Location of villagers.csv in the TidyTuesday repository (2020-05-05 folder).
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv"

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(url)

# Number of missing entries in each column (isnull is the alias of isna).
df.isnull().sum()

row_n           0
id              1
name            0
gender          0
species         0
birthday        0
personality     0
song           11
phrase          0
full_id         0
url             0
dtype: int64

In [3]:
# Column labels of the dataset, shown under a heading line
columns = df.columns
print("Columns in the dataset:", columns, sep="\n")

Columns in the dataset:
Index(['row_n', 'id', 'name', 'gender', 'species', 'birthday', 'personality',
       'song', 'phrase', 'full_id', 'url'],
      dtype='object')


In [4]:
# Size of the dataset: rows come from the index, columns from the column labels
num_rows = len(df.index)
num_columns = len(df.columns)
print(f'Number of rows: {num_rows}')
print(f'Number of columns: {num_columns}')

Number of rows: 391
Number of columns: 11


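"Observations" are the individual units that we observed directly; each observation is a collection of measured values — here, one row of the dataset (one villager).
"Variables" are the specific attributes of each individual that we are curious about — here, the columns of the dataset (for example, species or personality).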

In [7]:
# Display the first few rows of the dataset.
# The frame is left as the cell's last expression so the notebook renders it
# as a table; print() falls back to plain text, which wraps this wide frame
# across several column groups and is harder to read.
df.head()

   row_n       id     name  gender    species birthday personality  \
0      2  admiral  Admiral    male       bird     1-27      cranky   
1      3  agent-s  Agent S  female   squirrel      7-2       peppy   
2      4    agnes    Agnes  female        pig     4-21        uchi   
3      6       al       Al    male    gorilla    10-18        lazy   
4      7  alfonso  Alfonso    male  alligator      6-9        lazy   

          song    phrase           full_id  \
0   Steep Hill   aye aye  villager-admiral   
1      DJ K.K.  sidekick  villager-agent-s   
2   K.K. House   snuffle    villager-agnes   
3   Steep Hill   Ayyeeee       villager-al   
4  Forest Life  it'sa me  villager-alfonso   

                                                 url  
0  https://villagerdb.com/images/villagers/thumb/...  
1  https://villagerdb.com/images/villagers/thumb/...  
2  https://villagerdb.com/images/villagers/thumb/...  
3  https://villagerdb.com/images/villagers/thumb/...  
4  https://villagerdb.com/i

In [8]:
# describe() with no arguments summarises the numeric columns only
numeric_summary = df.describe()
print("Summary statistics for numeric columns:", numeric_summary, sep="\n")

Summary statistics for numeric columns:
            row_n
count  391.000000
mean   239.902813
std    140.702672
min      2.000000
25%    117.500000
50%    240.000000
75%    363.500000
max    483.000000


In [9]:
# Summary statistics for all columns, including categorical data.
# include='all' adds the non-numeric columns, which report unique/top/freq
# instead of mean/std/quantiles (those cells show NaN).
# The result is left as the last expression so the notebook renders one table
# rather than print()'s plain text, which wraps the columns into several groups.
df.describe(include='all')


             row_n       id     name gender species birthday personality  \
count   391.000000      390      391    391     391      391         391   
unique         NaN      390      391      2      35      361           8   
top            NaN  admiral  Admiral   male     cat     1-27        lazy   
freq           NaN        1        1    204      23        2          60   
mean    239.902813      NaN      NaN    NaN     NaN      NaN         NaN   
std     140.702672      NaN      NaN    NaN     NaN      NaN         NaN   
min       2.000000      NaN      NaN    NaN     NaN      NaN         NaN   
25%     117.500000      NaN      NaN    NaN     NaN      NaN         NaN   
50%     240.000000      NaN      NaN    NaN     NaN      NaN         NaN   
75%     363.500000      NaN      NaN    NaN     NaN      NaN         NaN   
max     483.000000      NaN      NaN    NaN     NaN      NaN         NaN   

                song   phrase           full_id  \
count            380      391       

In [10]:
# Number of missing (NaN) entries in each column
print(df.isna().sum())


row_n           0
id              1
name            0
gender          0
species         0
birthday        0
personality     0
song           11
phrase          0
full_id         0
url             0
dtype: int64


In [11]:
# Summary statistics for numerical columns (row_n is the only numeric column).
# Ending the cell with the expression uses the notebook's table display
# instead of the plain-text form produced by print().
df.describe()


            row_n
count  391.000000
mean   239.902813
std    140.702672
min      2.000000
25%    117.500000
50%    240.000000
75%    363.500000
max    483.000000


In [16]:
# df.shape is an attribute (no parentheses): a (rows, columns) tuple giving the
# size of the whole dataset, counting every column whatever its dtype.
print(df.shape)

(391, 11)


In [None]:
(a) The Number of Columns Analyzed
df.shape:
df.shape returns the overall size of the dataset as a (rows, columns) tuple, so it counts every column regardless of data type — here (391, 11).
df.describe():
By default, df.describe() only analyzes numeric columns, so here it reports on just one column (row_n); the ten non-numeric columns are left out unless include='all' is passed.

(b) The "Count" Column
df.shape:
The number of rows in df.shape is the total number of rows in the dataset, regardless of whether they contain missing data or not.
df.describe():
The "count" entry reported by df.describe() shows the number of non-null values in each column, i.e. how many rows have valid (non-missing) data for that specific column. For example, with include='all' the count for song is 380 rather than 391, because 11 of its values are missing.

In [12]:
# How often each value occurs in one column (here: species)
print(df.loc[:, 'species'].value_counts())


species
cat          23
rabbit       20
frog         18
squirrel     18
duck         17
dog          16
cub          16
pig          15
bear         15
mouse        15
horse        15
bird         13
penguin      13
sheep        13
elephant     11
wolf         11
ostrich      10
deer         10
eagle         9
gorilla       9
chicken       9
koala         9
goat          8
hamster       8
kangaroo      8
monkey        8
anteater      7
hippo         7
tiger         7
alligator     7
lion          7
bull          6
rhino         6
cow           4
octopus       3
Name: count, dtype: int64


In [13]:
# Overview of dataset.
# df.info() writes its report straight to the output and returns None, so it
# is called on its own; wrapping it in print() appended a stray "None" line
# after the report.
df.info()

# Summary statistics for numeric columns
print(df.describe())

# Summary statistics for all columns
print(df.describe(include='all'))

# Check missing values
print(df.isnull().sum())

# View value counts for the 'species' column
print(df['species'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391 entries, 0 to 390
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   row_n        391 non-null    int64 
 1   id           390 non-null    object
 2   name         391 non-null    object
 3   gender       391 non-null    object
 4   species      391 non-null    object
 5   birthday     391 non-null    object
 6   personality  391 non-null    object
 7   song         380 non-null    object
 8   phrase       391 non-null    object
 9   full_id      391 non-null    object
 10  url          391 non-null    object
dtypes: int64(1), object(10)
memory usage: 33.7+ KB
None
            row_n
count  391.000000
mean   239.902813
std    140.702672
min      2.000000
25%    117.500000
50%    240.000000
75%    363.500000
max    483.000000
             row_n       id     name gender species birthday personality  \
count   391.000000      390      391    391     391      391       

In [14]:
# Data type of every column
print(df.dtypes)

# Labels of the columns left over once every numeric dtype is excluded
non_numeric_columns = df.select_dtypes(exclude='number').columns
print("Non-numeric columns:", non_numeric_columns)

row_n           int64
id             object
name           object
gender         object
species        object
birthday       object
personality    object
song           object
phrase         object
full_id        object
url            object
dtype: object
Non-numeric columns: Index(['id', 'name', 'gender', 'species', 'birthday', 'personality', 'song',
       'phrase', 'full_id', 'url'],
      dtype='object')


In [15]:
# Overall size of the dataset as (rows, columns)
print("Shape of the dataset:", df.shape)

# describe() with its defaults covers the numeric columns only
print("Summary statistics for numeric columns:", df.describe(), sep="\n")

# include='all' extends the summary to the non-numeric columns as well
print("Summary statistics for all columns:", df.describe(include='all'), sep="\n")


Shape of the dataset: (391, 11)
Summary statistics for numeric columns:
            row_n
count  391.000000
mean   239.902813
std    140.702672
min      2.000000
25%    117.500000
50%    240.000000
75%    363.500000
max    483.000000
Summary statistics for all columns:
             row_n       id     name gender species birthday personality  \
count   391.000000      390      391    391     391      391         391   
unique         NaN      390      391      2      35      361           8   
top            NaN  admiral  Admiral   male     cat     1-27        lazy   
freq           NaN        1        1    204      23        2          60   
mean    239.902813      NaN      NaN    NaN     NaN      NaN         NaN   
std     140.702672      NaN      NaN    NaN     NaN      NaN         NaN   
min       2.000000      NaN      NaN    NaN     NaN      NaN         NaN   
25%     117.500000      NaN      NaN    NaN     NaN      NaN         NaN   
50%     240.000000      NaN      NaN    NaN   

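Attributes represent stored data about an object and are accessed without parentheses (for example, `df.shape`).
Methods are functions attached to an object that perform an action or computation and are called with parentheses (for example, `df.describe()`).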