In [2]:
import pandas as pd
import janitor
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv(r'C:\Users\Alvin Nguyen\OneDrive\1. Project\Practice\Internet\data\worldbank-country-internet-data.csv')

In [4]:
df.columns = [col[0:4] if '[' in col else col for col in df.columns]

In [5]:
df = df.clean_names()

In [6]:
df.head(1)

Unnamed: 0,country_name,country_code,series_name,series_code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Afghanistan,AFG,Individuals using the Internet (% of population),IT.NET.USER.ZS,..,..,..,..,..,..,...,7,8.26,11,13.5,16.8,17.6,18.4,..,..,..


In [7]:
selected_years = [str(year) for year in range(1990, 2024)]

selected_columns = ['country_name', 'country_code',	'series_name', 'series_code']

df = df[selected_columns + selected_years]

df.head(1)

Unnamed: 0,country_name,country_code,series_name,series_code,1990,1991,1992,1993,1994,1995,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Afghanistan,AFG,Individuals using the Internet (% of population),IT.NET.USER.ZS,0,0,0,0,0,0,...,7,8.26,11,13.5,16.8,17.6,18.4,..,..,..


In [8]:
selected_countries = ['USA', 'VNM', 'GBR', 'CHN', 'THA', 'SGB']

df = df[df['country_code'].isin(selected_countries)]

df.head()

Unnamed: 0,country_name,country_code,series_name,series_code,1990,1991,1992,1993,1994,1995,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
205,China,CHN,Individuals using the Internet (% of population),IT.NET.USER.ZS,0,0,0,0.00016868,0.001168115,0.00495471,...,47.9,50.3,53.2,54.3,59.2,64.0809,70.0528,73.0532,75.6113,77.4827
206,China,CHN,"Individuals using the Internet, female (% of f...",IT.NET.USER.FE.ZS,..,..,..,..,..,..,...,..,..,..,..,..,63.0484,70.4569,72.6218,75.2437,..
207,China,CHN,"Individuals using the Internet, male (% of mal...",IT.NET.USER.MA.ZS,..,..,..,..,..,..,...,..,..,..,..,..,65.0694,69.6681,73.4646,75.9624,..
208,China,CHN,Secure Internet servers,IT.NET.SECR,..,..,..,..,..,..,...,13314,26989,66054,289923,622142,1027286.0,1338370.0,1530873.0,1860276.0,2127874
209,China,CHN,Secure Internet servers (per 1 million people),IT.NET.SECR.P6,..,..,..,..,..,..,...,9.70507194611695,19.5592306465873,47.5965383811672,207.64925172699,443.51278907297,729.738695573417,948.458649280703,1083.91132572432,1317.31265600935,1508.37096213963


In [9]:
final_df = df[df['series_code'] == 'IT.NET.USER.ZS']

final_df = final_df.drop(['country_code', 'series_name', 'series_code'], axis=1)

In [10]:
final_df = final_df.melt(id_vars=['country_name'], value_name='pct_internet_users', var_name='years')

In [11]:
final_df.head()

Unnamed: 0,country_name,years,pct_internet_users
0,China,1990,0.0
1,Thailand,1990,0.0
2,United Kingdom,1990,0.087355319
3,United States,1990,0.784728502
4,Viet Nam,1990,0.0


In [12]:
final_df['years'] = pd.to_numeric(final_df['years'], errors='coerce')

final_df['pct_internet_users'] = pd.to_numeric(final_df['pct_internet_users'], errors='coerce')

final_df['pct_internet_users'] = final_df['pct_internet_users'].interpolate()

final_df.head(1)

Unnamed: 0,country_name,years,pct_internet_users
0,China,1990,0.0


In [13]:
from plotly.graph_objs import Legend


px.line(data_frame=final_df, x='years', y='pct_internet_users', line_group=final_df['country_name'], color=final_df['country_name'])

In [14]:
import sys
sys.path.append(r'C:\Users\Alvin Nguyen\OneDrive\1. Project\Practice\Internet\function')

In [15]:
from clean_data import clean_data

path = r'C:\Users\Alvin Nguyen\OneDrive\1. Project\Practice\Internet\data\worldbank-country-internet-data.csv'
internet_stats = clean_data(path=path, series_code='IT.NET.USER.ZS', country_code=['VNM', 'MYS', 'SGP', 'THA'])

px.line(internet_stats, x='years', y='pct_internet_users', line_group=internet_stats['country_name'], color=internet_stats['country_name'], title='Internet usage over time')

In [16]:
internet_stats[internet_stats['years'] == 2023]

Unnamed: 0,country_name,years,pct_internet_users
132,Malaysia,2023,97.6927
133,Singapore,2023,94.2863
134,Thailand,2023,89.5352
135,Viet Nam,2023,78.08


### Percentage change over time by each country

In [53]:
path = (r'C:\Users\Alvin Nguyen\OneDrive\1. Project\Practice\Internet\data\worldbank-country-internet-data.csv')

countries = df['country_code'].unique()

clean_df = clean_data(path=path, series_code='IT.NET.USER.ZS', country_code=countries)


In [None]:
clean_df = clean_df.dropna()

clean_df = clean_df.pivot(columns='country_name', index='years', values='pct_internet_users')

In [90]:
clean_df = clean_df.pct_change()

# clean_df.iloc[::5]

filtered_columns = [col for col in clean_df.columns 
                    if col in ['Viet Nam', 'Thailand', 'Singapore'] 
                    or 'United' in col]

In [110]:
results = clean_df.loc[::5]

results = results.astype(float).round(2)

results

country_name,Afghanistan,Africa Eastern and Southern,Africa Western and Central,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Arab World,...,Uzbekistan,Vanuatu,"Venezuela, RB",Viet Nam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990,,,,,,,,,,,...,,,,,,,,,,
1995,,,,,,,,,,,...,,,,,,,,,,
2000,,,,,,,,,,,...,,,,,,,,,,
2005,,4.42,,,,,,,,-1.0,...,,,-4.25,,,,-1.0,,,
2010,-0.47,-16.01,2.99,8.53,-11.72,,-1.01,-0.27,-58919.64,-0.77,...,-1.0,-41.06,-0.79,16.95,-259.52,0.58,-1043.69,16008.97,-46439.45,-26073.32
2015,-1.01,-0.3,1.74,-1.06,-1.0,,-962407300.0,444.23,-1.24,-1.95,...,-1.0,-1953.56,-0.22,-1.11,-1.24,5.53,43.51,-0.04,-1.0,-1.0
2020,-1.0,-16.13,-1.03,79.14,-1.41,,-1461.31,10.34,-3.78,-1.0,...,-3.89,-1.25,0.07,-930.73,0.15,-9.11,-1.0,787.12,-1.1,-1.18


In [109]:
SELECTED_YEAR = 2020

top_5_indices = results.loc[SELECTED_YEAR].nlargest(5).index
top_5_values = results.loc[SELECTED_YEAR].nlargest(5).values

final_result = pd.DataFrame({'country_name': top_5_indices, 'pct_change': top_5_values})

final_result


Unnamed: 0,country_name,pct_change
0,Ireland,inf
1,St. Kitts and Nevis,388947600.0
2,Greenland,69990920.0
3,Small states,17012230.0
4,Europe & Central Asia (IDA & IBRD countries),401563.8
