In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
from rapidfuzz import process, fuzz

warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Read the final dataset from parquet and print a quick structural overview:
# shape, column names, memory footprint, dtype counts, and the first rows.
df = pd.read_parquet('Data Cleaning/final_dataset.parquet')

divider = "=" * 50
row_total, column_total = df.shape

# Overall dimensions
print(f"Dataset Shape: {df.shape}")
print(f"Number of rows: {row_total:,}")
print(f"Number of columns: {column_total:,}")

print("\n" + divider)

# Numbered list of column names (numbering starts at 1)
print("Column Names:")
print(divider)
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_name}")

print("\n" + divider)

# Memory footprint (deep=True also counts the contents of object columns)
# and how many columns there are of each dtype
print("Dataset Info:")
print(divider)
total_bytes = df.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")
print("Data types:")
print(df.dtypes.value_counts())

print("\n" + divider)

# Preview: the trailing bare expression is what the notebook renders as a table
print("First 5 rows:")
print(divider)
df.head()

Dataset Shape: (2808, 12)
Number of rows: 2,808
Number of columns: 12

Column Names:
 1. official_name
 2. level
 3. speakers
 4. country
 5. official
 6. regional
 7. minority
 8. national
 9. widely_spoken
10. country_count
11. max_urbanization
12. max_internet_use

Dataset Info:
Memory usage: 0.68 MB
Data types:
int64      7
object     3
float64    2
Name: count, dtype: int64

First 5 rows:


Unnamed: 0,official_name,level,speakers,country,official,regional,minority,national,widely_spoken,country_count,max_urbanization,max_internet_use
0,!Xun,Vulnerable,18000,South Africa;Namibia;Angola,0,0,0,0,0,3,68.819,75.7
1,(Lower) Tanana,Critically Endangered,25,United States,0,0,0,0,0,1,83.298,93.1
2,A'ingae,Vulnerable,1500,Colombia;Ecuador,0,0,0,0,0,2,82.354,77.3
3,Aasáx,Extinct,0,Tanzania,0,0,0,0,0,1,37.409,29.1
4,Abaga,Critically Endangered,5,Papua New Guinea,0,0,0,0,0,1,13.723,24.1


In [16]:
# Print the distinct values found in each status column.
# Note: a notebook cell only auto-displays its *last* bare expression, so
# everything this cell should show is printed explicitly.
status_columns = ['widely_spoken', 'official', 'regional', 'national', 'minority']
for status_column in status_columns:
    print(df[status_column].unique())

# Pull out the rows whose `official` value is exactly 5 for a closer look.
# NOTE(review): the reason for picking 5 is not stated in this notebook — confirm intent.
filtered_df = df[df['official'] == 5]
print(filtered_df)

[ 0  1  2 36  3  7 12 27 10]
[ 0  2  1  3  7  6  5 96  4 28 11 10 98 61 18 26]
[ 0  1  5  2  3  8 20  7 12  6 67  4]
[ 0  2  1  4  5  3 15 10 50]
[ 0 11  1  8  7  4  3  2 21  5 15  6 75]
     official_name                  level  speakers  \
336             Bo    Severely Endangered        76   
1698          Nese  Critically Endangered        15   
2805       Italian         Not Endangered  66000000   

                                                country  official  regional  \
336                                    Papua New Guinea         5         1   
1698                                            Vanuatu         5         3   
2805  Albania;Argentina;Croatia;Eritrea;Italy;Sovere...         5         0   

      minority  national  widely_spoken  country_count  max_urbanization  \
336          1         3              0              1            13.723   
1698         4         4              7              1            25.976   
2805         1         1              3        