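### Throughout the final analysis, we use only the dataset produced by the previous code files. It is loaded below from wp_scored_city_articles_by_state_pre.csv, and the final dataset wp_scored_city_articles_by_state.csv is written at the end of this notebook.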

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, including pandas deprecation warnings; consider narrowing it.
warnings.simplefilter('ignore')

# Load the scored city articles and expose the file's 'regional_division'
# column under the shorter name 'region'.
df_final = (
    pd.read_csv('wp_scored_city_articles_by_state_pre.csv')
    .rename(columns={'regional_division': 'region'})
)
df_final.head()

Unnamed: 0,article_title,revision_id,article_quality,state,population,region
0,"Abbeville, Alabama",1171163550,C,Alabama,5074296,South
1,"Adamsville, Alabama",1177621427,C,Alabama,5074296,South
2,"Addison, Alabama",1168359898,C,Alabama,5074296,South
3,"Akron, Alabama",1165909508,GA,Alabama,5074296,South
4,"Alabaster, Alabama",1179139816,C,Alabama,5074296,South


In [2]:
# Load the state lookup table; the output below shows REGION, DIVISION and STATE columns.
regional_div = pd.read_csv('US States by Region in Table - US Census Bureau.csv')
# Display the loaded lookup table.
regional_div

Unnamed: 0,REGION,DIVISION,STATE
0,Northeast,New England,Connecticut
1,Northeast,New England,Maine
2,Northeast,New England,Massachusetts
3,Northeast,New England,New Hampshire
4,Northeast,New England,Rhode Island
5,Northeast,New England,Vermont
6,Northeast,Middle Atlantic,New Jersey
7,Northeast,Middle Atlantic,New York
8,Northeast,Middle Atlantic,Pennsylvania
9,Midwest,East North Central,Illinois


In [3]:
# Keep only the division-level label and align column names with df_final
# ('STATE' -> 'state', 'DIVISION' -> 'regional_division').
regional_div = (
    regional_div
    .drop(columns=['REGION'])
    .rename(columns={'STATE': 'state', 'DIVISION': 'regional_division'})
)
regional_div

Unnamed: 0,regional_division,state
0,New England,Connecticut
1,New England,Maine
2,New England,Massachusetts
3,New England,New Hampshire
4,New England,Rhode Island
5,New England,Vermont
6,Middle Atlantic,New Jersey
7,Middle Atlantic,New York
8,Middle Atlantic,Pennsylvania
9,East North Central,Illinois


In [4]:
# Attach each article's census division by joining on the state name
# (default inner join: only states present in both tables are kept).
df_final = df_final.merge(regional_div, on='state')
df_final

Unnamed: 0,article_title,revision_id,article_quality,state,population,region,regional_division
0,"Abbeville, Alabama",1171163550,C,Alabama,5074296,South,East South Central
1,"Adamsville, Alabama",1177621427,C,Alabama,5074296,South,East South Central
2,"Addison, Alabama",1168359898,C,Alabama,5074296,South,East South Central
3,"Akron, Alabama",1165909508,GA,Alabama,5074296,South,East South Central
4,"Alabaster, Alabama",1179139816,C,Alabama,5074296,South,East South Central
...,...,...,...,...,...,...,...
21520,"Wamsutter, Wyoming",1169591845,GA,Wyoming,581381,West,Mountain
21521,"Wheatland, Wyoming",1176370621,GA,Wyoming,581381,West,Mountain
21522,"Worland, Wyoming",1166347917,GA,Wyoming,581381,West,Mountain
21523,"Wright, Wyoming",1166334449,GA,Wyoming,581381,West,Mountain


## Coverage of articles by states

In [5]:
# Population is repeated on every article row of a state, so taking the max
# per state simply recovers that state's population.
pop_by_state = df_final.groupby('state', as_index=False)['population'].max()
pop_by_state

Unnamed: 0,state,population
0,Alabama,5074296
1,Alaska,733583
2,Arizona,7359197
3,Arkansas,3045637
4,California,39029342
5,Colorado,5839926
6,Delaware,1018396
7,Florida,22244823
8,Georgia,10912876
9,Hawaii,1440196


In [6]:
# Number of articles per state (non-null titles), joined with state population.
total_articles = (
    df_final.groupby('state')['article_title']
    .count()
    .rename('article_count')
    .reset_index()
    .merge(pop_by_state, on='state')
)
total_articles['per_capita'] = total_articles['article_count'] / total_articles['population']

# reset_index() without drop keeps each state's original row position as an 'index' column.
bottom10_coverage = total_articles.sort_values('per_capita').head(10).reset_index()
top10_coverage = total_articles.sort_values('per_capita', ascending=False).head(10).reset_index()

### Top 10 US states by coverage

The 10 US states with the highest total articles per capita (in descending order).

In [7]:
# Show the ten states with the highest articles-per-capita ratio.
top10_coverage

Unnamed: 0,index,state,article_count,population,per_capita
0,42,Vermont,329,647064,0.000508
1,31,North Dakota,356,779261,0.000457
2,17,Maine,483,1385340,0.000349
3,38,South Dakota,311,909824,0.000342
4,13,Iowa,1043,3200517,0.000326
5,1,Alaska,149,733583,0.000203
6,35,Pennsylvania,2556,12972008,0.000197
7,20,Michigan,1773,10034113,0.000177
8,47,Wyoming,99,581381,0.00017
9,26,New Hampshire,234,1395231,0.000168


In [8]:
# Disabled plotting cell (not executed): line plot of per-capita coverage and
# scaled population for the top-10 states.
# NOTE(review): before re-enabling, move plt.legend() after the plt.plot() calls
# and give the lines labels; as written no labelled line exists when it runs.
# NOTE(review): 10e10 equals 1e11 (100 billion).
# plt.figure(figsize=(10,6))
# plt.xticks(rotation='vertical')
# plt.title('Top 10 US States by Coverage')
# plt.legend(title='Parameter where:')
# plt.xlabel("US States")
# plt.ylabel("Total Articles Per Capita")
# plt.ylim([0,0.0006])
# plt.plot(top10_coverage['state'], top10_coverage['per_capita'])
# plt.plot(top10_coverage['state'], top10_coverage['population'] / 10e10)

### Bottom 10 US states by coverage
The 10 US states with the lowest total articles per capita (in ascending order).

In [9]:
# Show the ten states with the lowest articles-per-capita ratio.
bottom10_coverage

Unnamed: 0,index,state,article_count,population,per_capita
0,30,North Carolina,50,10698973,5e-06
1,25,Nevada,19,3177772,6e-06
2,4,California,482,39029342,1.2e-05
3,2,Arizona,91,7359197,1.2e-05
4,43,Virginia,133,8683619,1.5e-05
5,7,Florida,412,22244823,1.9e-05
6,33,Oklahoma,75,4019800,1.9e-05
7,14,Kansas,63,2937150,2.1e-05
8,18,Maryland,157,6164660,2.5e-05
9,46,Wisconsin,192,5892539,3.3e-05


In [10]:
# Disabled plotting cell (not executed): line plot of per-capita coverage and
# scaled population for the bottom-10 states.
# NOTE(review): before re-enabling, move plt.legend() after the plt.plot() calls
# so the labelled line exists when the legend is built.
# NOTE(review): 10e10 equals 1e11 (100 billion), which does not match the
# "in 10 Billion" label; the label text also contains the typo 'Ppopulation'.
# plt.figure(figsize=(10,6))
# plt.xticks(rotation='vertical')
# plt.title('Bottom 10 US States by Coverage')
# plt.legend(title='Parameter where:')
# plt.xlabel("US States")
# plt.ylabel("Total Articles Per Capita")
# # plt.ylim([0,0.06])
# plt.plot(bottom10_coverage['state'], bottom10_coverage['per_capita'])
# plt.plot(bottom10_coverage['state'], bottom10_coverage['population'] / 10e10, label='Ppopulation in 10 Billion')

## Analysis of articles by states and high quality - take FA & GA as high quality

In [11]:
# Keep only articles rated FA or GA, which this analysis treats as high quality.
high_quality = df_final[df_final['article_quality'].isin(['FA', 'GA'])]

In [12]:
# Inspect the filtered frame of FA- and GA-rated articles.
high_quality

Unnamed: 0,article_title,revision_id,article_quality,state,population,region,regional_division
3,"Akron, Alabama",1165909508,GA,Alabama,5074296,South,East South Central
6,"Alexander City, Alabama",1179140073,GA,Alabama,5074296,South,East South Central
7,"Aliceville, Alabama",1167792390,GA,Alabama,5074296,South,East South Central
14,"Ardmore, Alabama",1176903479,GA,Alabama,5074296,South,East South Central
33,"Bear Creek, Alabama",1166015184,GA,Alabama,5074296,South,East South Central
...,...,...,...,...,...,...,...
21517,"Torrington, Wyoming",1171948527,GA,Wyoming,581381,West,Mountain
21520,"Wamsutter, Wyoming",1169591845,GA,Wyoming,581381,West,Mountain
21521,"Wheatland, Wyoming",1176370621,GA,Wyoming,581381,West,Mountain
21522,"Worland, Wyoming",1166347917,GA,Wyoming,581381,West,Mountain


In [13]:
# High-quality article count per state, joined with state population.
group_high_quality = (
    high_quality.groupby('state')['article_title']
    .count()
    .rename('high_quality_count')
    .reset_index()
    .merge(pop_by_state, on='state')
)
group_high_quality['per_capita'] = (
    group_high_quality['high_quality_count'] / group_high_quality['population']
)

# Ten lowest / ten highest states by high-quality articles per capita,
# re-indexed 0..9 without keeping the old row labels.
bottom10_coverage_high = (
    group_high_quality.sort_values('per_capita').head(10).reset_index(drop=True)
)
top10_coverage_high = (
    group_high_quality.sort_values('per_capita', ascending=False).head(10).reset_index(drop=True)
)

### Top 10 US states by high quality
The 10 US states with the highest high quality articles per capita (in descending order).

In [14]:
# Show the ten states with the most high-quality articles per capita.
top10_coverage_high

Unnamed: 0,state,high_quality_count,population,per_capita
0,Vermont,45,647064,7e-05
1,Wyoming,39,581381,6.7e-05
2,South Dakota,56,909824,6.2e-05
3,West Virginia,105,1775156,5.9e-05
4,Montana,55,1122867,4.9e-05
5,New Hampshire,63,1395231,4.5e-05
6,Pennsylvania,566,12972008,4.4e-05
7,Missouri,263,6177957,4.3e-05
8,Alaska,31,733583,4.2e-05
9,New Jersey,379,9261699,4.1e-05


### Bottom 10 US states by high quality
The 10 US states with the lowest high quality articles per capita (in ascending order).

In [15]:
# Show the ten states with the fewest high-quality articles per capita.
bottom10_coverage_high

Unnamed: 0,state,high_quality_count,population,per_capita
0,North Carolina,20,10698973,2e-06
1,Virginia,18,8683619,2e-06
2,Nevada,8,3177772,3e-06
3,Arizona,24,7359197,3e-06
4,California,174,39029342,4e-06
5,Florida,118,22244823,5e-06
6,New York,111,19677151,6e-06
7,Maryland,42,6164660,7e-06
8,Kansas,22,2937150,7e-06
9,Oklahoma,32,4019800,8e-06


## Census divisions

In [16]:
# Population per census division, computed from the states present in df_final.
# Step 1: collapse the per-article rows to one population value per
#         (state, division); population is constant within a state, so the
#         mean just recovers it (and keeps the float dtype shown below).
# Step 2: sum those state populations within each division.
# Only 'population' is aggregated: calling mean() on the whole frame also
# covers the string columns (article_title, article_quality), which raises
# TypeError on pandas >= 2.0 and only worked with a warning before.
region_pop = (
    df_final.groupby(['state', 'regional_division'])['population']
    .mean()
    .groupby(level='regional_division')
    .sum()
    .reset_index()
)
region_pop

Unnamed: 0,regional_division,population
0,East North Central,47097779.0
1,East South Central,19578002.0
2,Middle Atlantic,41910858.0
3,Mountain,25514320.0
4,New England,11503343.0
5,Pacific,53229044.0
6,South Atlantic,66781137.0
7,West North Central,19721893.0
8,West South Central,41685250.0


In [17]:
# Read the population-estimates CSV; the next cell keeps only NAME and POPESTIMATE2022.
pop2022 = pd.read_csv('NST-EST2022-ALLDATA.csv')

In [18]:
# Keep just the area name and its 2022 population estimate.
pop2022 = pop2022.loc[:, ['NAME', 'POPESTIMATE2022']]

In [19]:
# Inspect the two-column population table (NAME, POPESTIMATE2022).
pop2022

Unnamed: 0,NAME,POPESTIMATE2022
0,United States,333287557
1,Northeast Region,57040406
2,New England,15129548
3,Middle Atlantic,41910858
4,Midwest Region,68787595
...,...,...
61,Washington,7785786
62,West Virginia,1775156
63,Wisconsin,5892539
64,Wyoming,581381


In [20]:
# Population per region (the 'region' column of df_final), computed from the
# states present in df_final: one population value per (state, region),
# then summed within each region.
# Only 'population' is aggregated: calling mean() on the whole frame also
# covers the string columns (article_title, article_quality), which raises
# TypeError on pandas >= 2.0 and only worked with a warning before.
wholedivision_pop = (
    df_final.groupby(['state', 'region'])['population']
    .mean()
    .groupby(level='region')
    .sum()
    .reset_index()
)
wholedivision_pop

Unnamed: 0,region,population
0,Midwest,66819672.0
1,Northeast,53414201.0
2,South,128044389.0
3,West,78743364.0


In [21]:
# Connecticut is in Northeast Region
# Nebraska is in Midwest region
# The population table names regions with a ' Region' suffix, so build the
# matching labels and print the estimate for each of them.
regions = (wholedivision_pop['region'] + ' Region').tolist()
region_rows = pop2022[pop2022['NAME'].isin(regions)]
for area_name, estimate in zip(region_rows['NAME'], region_rows['POPESTIMATE2022']):
    print(area_name, estimate)

Northeast Region 57040406
Midwest Region 68787595
South Region 128716192
West Region 78743364


In [22]:
# Gap between the South Region estimate in pop2022 and the South total
# summed from df_final.
south_estimate = pop2022.loc[pop2022['NAME'] == 'South Region', 'POPESTIMATE2022'].tolist()[0]
south_from_articles = wholedivision_pop.loc[wholedivision_pop['region'] == 'South', 'population'].tolist()[0]
diffSouth = south_estimate - south_from_articles
print('Gap in South Region from different sources of data: ' + str(diffSouth))

# Report any area whose population estimate equals that gap exactly.
gap_rows = pop2022[pop2022['POPESTIMATE2022'] == diffSouth]
for area_name, estimate in zip(gap_rows['NAME'], gap_rows['POPESTIMATE2022']):
    print('Population in ' + area_name + ':', estimate)


Gap in South Region from different sources of data: 671803.0
Population in District of Columbia: 671803


In [23]:
def _first_match(frame, key_col, key, value_col):
    """Return value_col from the first row of frame where key_col equals key."""
    return frame.loc[frame[key_col] == key, value_col].tolist()[0]


# Northeast: compare the regional gap with Connecticut's population estimate.
popConnecticut = _first_match(pop2022, 'NAME', 'Connecticut', 'POPESTIMATE2022')
diffNortheast = (
    _first_match(pop2022, 'NAME', 'Northeast Region', 'POPESTIMATE2022')
    - _first_match(wholedivision_pop, 'region', 'Northeast', 'population')
)
print('Gap in Northeast Region from different sources of data: ' + str(diffNortheast))
print('Population in Connecticut: '+ str(popConnecticut))

# Midwest: compare the regional gap with Nebraska's population estimate.
popNebraska = _first_match(pop2022, 'NAME', 'Nebraska', 'POPESTIMATE2022')
diffMidwest = (
    _first_match(pop2022, 'NAME', 'Midwest Region', 'POPESTIMATE2022')
    - _first_match(wholedivision_pop, 'region', 'Midwest', 'population')
)
print('Gap in Midwest Region from different sources of data: ' + str(diffMidwest))
print('Population in Nebraska: '+ str(popNebraska))

Gap in Northeast Region from different sources of data: 3626205.0
Population in Connecticut: 3626205
Gap in Midwest Region from different sources of data: 1967923.0
Population in Nebraska: 1967923


### Comparing the populations of the four regions computed from our DataFrame with those from the U.S. Census Bureau, we notice that the populations of the Northeast, Midwest and South regions do not match. We validate that the differences for the Northeast and Midwest regions are due to the exclusion of Connecticut and Nebraska, respectively, from df_final. We also validate that the difference for the South region comes from the District of Columbia.

### In order to provide further reproducibility, the population data from both sources will be used to show differences

### Census divisions by total coverage
A rank ordered list of US census divisions (in descending order) by total articles per capita.

In [24]:
# Re-display the per-division population totals computed from df_final.
region_pop

Unnamed: 0,regional_division,population
0,East North Central,47097779.0
1,East South Central,19578002.0
2,Middle Atlantic,41910858.0
3,Mountain,25514320.0
4,New England,11503343.0
5,Pacific,53229044.0
6,South Atlantic,66781137.0
7,West North Central,19721893.0
8,West South Central,41685250.0


### Here we look into the details of the differences between the population from the US Census Bureau and our own calculation

In [25]:
# Put our computed division populations next to the POPESTIMATE2022 values
# by matching the division name against NAME in pop2022.
bureau_region_pop = region_pop.merge(
    pop2022, left_on='regional_division', right_on='NAME'
)
bureau_region_pop

Unnamed: 0,regional_division,population,NAME,POPESTIMATE2022
0,East North Central,47097779.0,East North Central,47097779
1,East South Central,19578002.0,East South Central,19578002
2,Middle Atlantic,41910858.0,Middle Atlantic,41910858
3,Mountain,25514320.0,Mountain,25514320
4,New England,11503343.0,New England,15129548
5,Pacific,53229044.0,Pacific,53229044
6,South Atlantic,66781137.0,South Atlantic,67452940
7,West North Central,19721893.0,West North Central,21689816
8,West South Central,41685250.0,West South Central,41685250


In [26]:
# Number of articles (non-null titles) in each census division.
total_by_region = (
    df_final.groupby('regional_division')['article_title']
    .count()
    .reset_index(name='article_count')
)
total_by_region

Unnamed: 0,regional_division,article_count
0,East North Central,4754
1,East South Central,1529
2,Middle Atlantic,3781
3,Mountain,1189
4,New England,1437
5,Pacific,1304
6,South Atlantic,1850
7,West North Central,3578
8,West South Central,2103


### Below is the table of the per capita of the total articles in each regional division using calculated population, i.e. excluding the states not included in the dataset

In [27]:
# Articles per capita per division, using the population summed from df_final.
total_by_region_cal = total_by_region.merge(region_pop, on='regional_division').assign(
    per_capita=lambda t: t['article_count'] / t['population']
)
total_by_region_cal.sort_values('per_capita', ascending=False)

Unnamed: 0,regional_division,article_count,population,per_capita
7,West North Central,3578,19721893.0,0.000181
4,New England,1437,11503343.0,0.000125
0,East North Central,4754,47097779.0,0.000101
2,Middle Atlantic,3781,41910858.0,9e-05
1,East South Central,1529,19578002.0,7.8e-05
8,West South Central,2103,41685250.0,5e-05
3,Mountain,1189,25514320.0,4.7e-05
6,South Atlantic,1850,66781137.0,2.8e-05
5,Pacific,1304,53229044.0,2.4e-05


### Below is the table of the per capita of total articles in each regional division using the population data from the US Census Bureau, i.e. including the states not included in the dataset

In [28]:
# Articles per capita per division, using the POPESTIMATE2022 figures.
total_by_region_bureau = total_by_region.merge(bureau_region_pop, on='regional_division').assign(
    per_capita=lambda t: t['article_count'] / t['POPESTIMATE2022']
)
total_by_region_bureau.sort_values('per_capita', ascending=False)

Unnamed: 0,regional_division,article_count,population,NAME,POPESTIMATE2022,per_capita
7,West North Central,3578,19721893.0,West North Central,21689816,0.000165
0,East North Central,4754,47097779.0,East North Central,47097779,0.000101
4,New England,1437,11503343.0,New England,15129548,9.5e-05
2,Middle Atlantic,3781,41910858.0,Middle Atlantic,41910858,9e-05
1,East South Central,1529,19578002.0,East South Central,19578002,7.8e-05
8,West South Central,2103,41685250.0,West South Central,41685250,5e-05
3,Mountain,1189,25514320.0,Mountain,25514320,4.7e-05
6,South Atlantic,1850,66781137.0,South Atlantic,67452940,2.7e-05
5,Pacific,1304,53229044.0,Pacific,53229044,2.4e-05


### Census divisions by high quality coverage
Rank ordered list of US census divisions (in descending order) by high quality articles per capita.

In [29]:
# Count high-quality articles (non-null titles) per census division.
region_highquality = (
    high_quality.groupby('regional_division')['article_title'].count().reset_index()
)

In [30]:
# Give the count column a descriptive name, then show the table.
region_highquality = region_highquality.rename(
    {'article_title': 'article_count'}, axis='columns'
)
region_highquality

Unnamed: 0,regional_division,article_count
0,East North Central,711
1,East South Central,315
2,Middle Atlantic,1056
3,Mountain,336
4,New England,225
5,Pacific,491
6,South Atlantic,524
7,West North Central,638
8,West South Central,638


### Below is the table of the per capita of high quality articles in each regional division using calculated population, i.e. excluding the states not included in the dataset

In [31]:
# High-quality articles per capita per division, using the population summed from df_final.
region_highquality_cal = region_highquality.merge(region_pop, on='regional_division').assign(
    per_capita=lambda t: t['article_count'] / t['population']
)
region_highquality_cal.sort_values('per_capita', ascending=False)

Unnamed: 0,regional_division,article_count,population,per_capita
7,West North Central,638,19721893.0,3.2e-05
2,Middle Atlantic,1056,41910858.0,2.5e-05
4,New England,225,11503343.0,2e-05
1,East South Central,315,19578002.0,1.6e-05
8,West South Central,638,41685250.0,1.5e-05
0,East North Central,711,47097779.0,1.5e-05
3,Mountain,336,25514320.0,1.3e-05
5,Pacific,491,53229044.0,9e-06
6,South Atlantic,524,66781137.0,8e-06


### Below is the table of the per capita of high quality articles in each regional division using the population data from the US Census Bureau, i.e. including the states not included in the dataset

In [33]:
# High-quality articles per capita per division, using the POPESTIMATE2022 figures.
region_highquality_bureau = region_highquality.merge(bureau_region_pop, on='regional_division').assign(
    per_capita=lambda t: t['article_count'] / t['POPESTIMATE2022']
)
region_highquality_bureau.sort_values('per_capita', ascending=False)

Unnamed: 0,regional_division,article_count,population,NAME,POPESTIMATE2022,per_capita
7,West North Central,638,19721893.0,West North Central,21689816,2.9e-05
2,Middle Atlantic,1056,41910858.0,Middle Atlantic,41910858,2.5e-05
1,East South Central,315,19578002.0,East South Central,19578002,1.6e-05
8,West South Central,638,41685250.0,West South Central,41685250,1.5e-05
0,East North Central,711,47097779.0,East North Central,47097779,1.5e-05
4,New England,225,11503343.0,New England,15129548,1.5e-05
3,Mountain,336,25514320.0,Mountain,25514320,1.3e-05
5,Pacific,491,53229044.0,Pacific,53229044,9e-06
6,South Atlantic,524,66781137.0,South Atlantic,67452940,8e-06


In [34]:
# Drop the 'region' column so only 'regional_division' remains, then write
# the final dataset without the row index.
# errors='ignore' keeps this cell re-runnable: df_final is reassigned here,
# so a second run would otherwise raise KeyError because 'region' is gone.
df_final = df_final.drop(columns=['region'], errors='ignore')
df_final.to_csv('wp_scored_city_articles_by_state.csv', index=False)

Unnamed: 0,article_title,revision_id,article_quality,state,population,region,regional_division
0,"Abbeville, Alabama",1171163550,C,Alabama,5074296,South,East South Central
1,"Adamsville, Alabama",1177621427,C,Alabama,5074296,South,East South Central
2,"Addison, Alabama",1168359898,C,Alabama,5074296,South,East South Central
3,"Akron, Alabama",1165909508,GA,Alabama,5074296,South,East South Central
4,"Alabaster, Alabama",1179139816,C,Alabama,5074296,South,East South Central
...,...,...,...,...,...,...,...
21520,"Wamsutter, Wyoming",1169591845,GA,Wyoming,581381,West,Mountain
21521,"Wheatland, Wyoming",1176370621,GA,Wyoming,581381,West,Mountain
21522,"Worland, Wyoming",1166347917,GA,Wyoming,581381,West,Mountain
21523,"Wright, Wyoming",1166334449,GA,Wyoming,581381,West,Mountain
