# Preparation of aggregated COVID-19 data 

In [1]:
import pandas as pd

# Raise pandas' display limits (total width, number of columns, number of rows).
for option_name, limit in (
    ('display.width', 1000),
    ('display.max_columns', 500),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, limit)

# The Excel file ends with a long run of empty rows, so only the first
# 5613 rows are kept.
df = pd.read_excel('covid.xlsx', index_col=None).iloc[:5613]

  warn(msg)


In [2]:
import datetime
def year_fraction(date):
    """Express a date as a decimal year.

    The result is the calendar year plus the share of that year that has
    already elapsed at the start of the given day, e.g. 1 January maps to
    exactly ``year + 0.0``.

    Parameters
    ----------
    date : object exposing ``.year`` and ``.toordinal()``
        For example ``datetime.date`` or ``datetime.datetime``.

    Returns
    -------
    float
        ``date.year + days_since_jan_1 / days_in_year``.
    """
    jan_first = datetime.date(date.year, 1, 1).toordinal()
    next_jan_first = datetime.date(date.year + 1, 1, 1).toordinal()
    days_in_year = next_jan_first - jan_first  # 365, or 366 in a leap year
    days_elapsed = date.toordinal() - jan_first
    return date.year + float(days_elapsed) / days_in_year

In [3]:
df.shape

(5613, 34)

In [4]:
df.columns

Index(['s_no', 'Reported_On', 'Additional_Reporting', 'Retrieve_from_1', 'Retrieve_from_2', 'Retrieve_from_3', 'Twitter_Reference', 'Direct_Post_1', 'Direct_Post_2', 'Direct_Post_3', 'Direct_Post_4', 'Title', 'Publication_Date', 'Entry_Date', 'Primary_Country', 'Secondary_Country', 'Primary_Language', 'Secondary_Language', 'Main_Narrative', 'Recoded_Main_Narrative', 'Recoded_Narrative_Coder', 'Narrative_Description', 'ChainMsg_or_Email', 'Motive', 'Motive_Description', 'Source', 'Source_Description', 'Distrib_Channel', 'Misinfo_Type', 'Key_Words', 'Summary', 'Coder', 'Notes', 'Region'], dtype='object')

## Many country names are poorly recorded. The output below shows that entries combine several countries, are tagged "(ambiguous)", or are misspelled. In addition, the "Region" column of the original dataset is mostly blank, even though the researchers used it. Here I use a large dictionary to backfill a new "region" column.

In [5]:
df["Primary_Country"].unique()

array(['India', 'China', 'Canada', 'United States', 'Japan', 'France',
       'Taiwan, Province of China', 'Russian Federation',
       'Iran, Islamic Republic of', 'Iraq', 'South Africa',
       'Korea, Republic of', 'United Arab Emirates', 'Australia', 'Italy',
       'Nigeria', 'Philippines', 'Kenya', 'Spain', 'Latvia',
       'Venezuela, Bolivarian Republic of', 'Czech Republic',
       'United Kingdom', 'Israel', 'Thailand', 'Armenia', 'Belarus',
       'Ambiguous', 'Sri Lanka', 'Hong Kong', 'Philippines, Sri Lanka',
       'Singapore', 'Pakistan', 'Indonesia, Philippines',
       'India, Canada, Philippines', 'Indonensia', 'Cambodia',
       'US, Ambiguous', 'Estonia', 'Poland', 'China (Hong Kong)',
       'Philippines, Kenya, Nigeria, Spain', 'Germany', 'Afghanistan',
       'Kenya, Nigeria, South Africa, Uganda', 'Ethiopia', 'Indonesia',
       'Thailand, Ambiguous', 'Australia, ambiguous', 'China, ambiguous',
       'Malaysia', 'Zimbabwe', 'Indonesia, Malaysia, Ambiguous',
   

In [6]:
# Lookup table from the raw "Primary_Country" strings to the coarse region
# label used for aggregation. Keys are matched exactly, so every spelling,
# capitalisation and trailing-space variant seen in the data has its own entry.
# Combined entries whose countries span several regions are labelled "Other".
region_dict = {
    "Angola":"Africa",
    "Burkina Faso":"Africa",
    "Burundi":"Africa",
    "Benin":"Africa",
    "Botswana":"Africa",
    "Congo, The Democratic Republic of the":"Africa",
    "Central African Republic":"Africa",
    "Congo":"Africa",
    "Côte D'Ivoire":"Africa",
    "Cameroon":"Africa",
    "Cape Verde":"Africa",
    "Western Sahara":"Africa",
    "Eritrea":"Africa",
    "Ethiopia":"Africa",
    "Gabon":"Africa",
    "Ghana":"Africa",
    "Gambia":"Africa",
    "Guinea":"Africa",
    "Equatorial Guinea":"Africa",
    "Guinea-Bissau":"Africa",
    "Kenya":"Africa",
    "Liberia":"Africa",
    "Lesotho":"Africa",
    "Madagascar":"Africa",
    "Mali":"Africa",
    "Mauritius":"Africa",
    "Malawi":"Africa",
    "Mozambique":"Africa",
    "Namibia":"Africa",
    "Niger":"Africa",
    "Nigeria":"Africa",
    "Rwanda":"Africa",
    "Seychelles":"Africa",
    "South Sudan":"Africa",
    "Saint Helena":"Africa",
    "Sierra Leone":"Africa",
    "Senegal":"Africa",
    "Sao Tome and Principe":"Africa",
    "Eswatini":"Africa",
    "Chad":"Africa",
    "Togo":"Africa",
    "Tanzania, United Republic of":"Africa",
    "Uganda":"Africa",
    "Mayotte":"Africa",
    "South Africa":"Africa",
    "Zambia":"Africa",
    "Zimbabwe":"Africa",
    "Bahrain":"Middle east",
    "Djibouti":"Africa",
    "Algeria":"Africa",
    "Comoros":"Africa",
    "Morocco":"Africa",
    "Mauritania":"Africa",
    "Palestinian Territory":"Middle east",
    "Sudan":"Africa",
    "Somalia":"Africa",
    "Tunisia":"Africa",
    "Australia":"Asia & Pacific",
    "Azerbaijan":"Asia & Pacific",
    "Japan":"Asia & Pacific",
    "Korea, Republic of":"Asia & Pacific",
    "Kazakhstan":"Asia & Pacific",
    "New Zealand":"Asia & Pacific",
    "Singapore":"Asia & Pacific",
    "Taiwan":"China & Taiwan",
    "Afghanistan":"Asia & Pacific",
    "Antarctica":"Asia & Pacific",
    "American Samoa":"Asia & Pacific",
    "Bangladesh":"Asia & Pacific",
    "Brunei Darussalam":"Asia & Pacific",
    "Bhutan":"Asia & Pacific",
    "Cocos (Keeling) Islands":"Asia & Pacific",
    "Cook Islands":"Asia & Pacific",
    "China":"China & Taiwan",
    "Christmas Island":"Asia & Pacific",
    "Fiji":"Asia & Pacific",
    "Micronesia, Federated States of":"Asia & Pacific",
    "Guam":"Asia & Pacific",
    # NOTE(review): plain "Hong Kong" is "Asia & Pacific", while the
    # 'China (Hong Kong)', 'Hong Kong (ambiguous)' and 'Hong Kong, Ambiguous'
    # variants below are "China & Taiwan" - confirm which is intended.
    "Hong Kong":"Asia & Pacific",
    "Heard Island and McDonald Islands":"Asia & Pacific",
    "Indonesia":"Asia & Pacific",
    "India":"Asia & Pacific",
    "British Indian Ocean Territory":"Asia & Pacific",
    "Kyrgyzstan":"Asia & Pacific",
    "Cambodia":"Asia & Pacific",
    "Kiribati":"Asia & Pacific",
    "Korea, Democratic People's Republic of":"Asia & Pacific",
    "Lao People's Democratic Republic":"Asia & Pacific",
    "Sri Lanka":"Asia & Pacific",
    "Marshall Islands":"Asia & Pacific",
    "Myanmar":"Asia & Pacific",
    "Mongolia":"Asia & Pacific",
    "Macau":"Asia & Pacific",
    "Northern Mariana Islands":"Asia & Pacific",
    "Maldives":"Asia & Pacific",
    "Malaysia":"Asia & Pacific",
    "New Caledonia":"Asia & Pacific",
    "Norfolk Island":"Asia & Pacific",
    "Nepal":"Asia & Pacific",
    "Nauru":"Asia & Pacific",
    "Niue":"Asia & Pacific",
    "French Polynesia":"Asia & Pacific",
    "Papua New Guinea":"Asia & Pacific",
    "Philippines":"Asia & Pacific",
    "Pakistan":"Asia & Pacific",
    "Pitcairn Islands":"Asia & Pacific",
    "Palau":"Asia & Pacific",
    # NOTE(review): Reunion is usually grouped with Africa - confirm.
    "Reunion":"Asia & Pacific",
    "Solomon Islands":"Asia & Pacific",
    # NOTE(review): Syria might be expected under "Middle east" - confirm.
    "Syrian Arab Republic":"Asia & Pacific",
    "French Southern Territories":"Asia & Pacific",
    "Thailand":"Asia & Pacific",
    "Tajikistan":"Asia & Pacific",
    "Tokelau":"Asia & Pacific",
    "Turkmenistan":"Asia & Pacific",
    "Tonga":"Asia & Pacific",
    "Timor-Leste":"Asia & Pacific",
    "Tuvalu":"Asia & Pacific",
    "United States Minor Outlying Islands":"Asia & Pacific",
    "Uzbekistan":"Asia & Pacific",
    "Vietnam":"Asia & Pacific",
    "Vanuatu":"Asia & Pacific",
    "Wallis and Futuna":"Asia & Pacific",
    "Samoa":"Asia & Pacific",
    "Andorra":"Europe",
    "Albania":"Europe",
    "Armenia":"Europe",
    "Austria":"Europe",
    "Bosnia and Herzegovina":"Europe",
    "Belgium":"Europe",
    "Bulgaria":"Europe",
    "Belarus":"Europe",
    "Switzerland":"Europe",
    "Cyprus":"Europe",
    "Czech Republic":"Europe",
    "Germany":"Europe",
    "Denmark":"Europe",
    "Estonia":"Europe",
    "Spain":"Europe",
    "Finland":"Europe",
    "Faroe Islands":"Europe",
    "France":"Europe",
    "France, Metropolitan":"Europe",
    "United Kingdom":"Europe",
    "Georgia":"Europe",
    "Gibraltar":"Europe",
    "Greenland":"Europe",
    "Greece":"Europe",
    "Croatia":"Europe",
    "Hungary":"Europe",
    "Ireland":"Europe",
    "Iceland":"Europe",
    "Italy":"Europe",
    "Liechtenstein":"Europe",
    "Lithuania":"Europe",
    "Luxembourg":"Europe",
    "Latvia":"Europe",
    "Monaco":"Europe",
    "Moldova, Republic of":"Europe",
    "Macedonia":"Europe",
    "Malta":"Europe",
    "Netherlands":"Europe",
    "Norway":"Europe",
    "Poland":"Europe",
    "Portugal":"Europe",
    "Romania":"Europe",
    # Russia and Turkey are kept as regions of their own.
    "Russian Federation":"Russia",
    "Sweden":"Europe",
    "Slovenia":"Europe",
    "Svalbard and Jan Mayen":"Europe",
    "Slovakia":"Europe",
    "San Marino":"Europe",
    "Turkey":"Turkey",
    "Ukraine":"Europe",
    "Holy See (Vatican City State)":"Europe",
    "Serbia":"Europe",
    "Montenegro":"Europe",
    "Aland Islands":"Europe",
    "Guernsey":"Europe",
    "Isle of Man":"Europe",
    "Jersey":"Europe",
    "Israel":"Middle east",
    "United Arab Emirates":"Middle east",
    "Egypt":"Middle east",
    "Iraq":"Middle east",
    "Iran, Islamic Republic of":"Middle east",
    "Jordan":"Middle east",
    "Kuwait":"Middle east",
    "Lebanon":"Middle east",
    "Libya":"Middle east",
    "Oman":"Middle east",
    "Qatar":"Middle east",
    "Saudi Arabia":"Middle east",
    "Yemen":"Middle east",
    "Canada":"Canada & United States",
    "Saint Pierre and Miquelon":"Canada & United States",
    "United States":"Canada & United States",
    "Nicaragua":"South/Latin America",
    "Antigua and Barbuda":"South/Latin America",
    "Anguilla":"South/Latin America",
    "Netherlands Antilles":"South/Latin America",
    "Argentina":"South/Latin America",
    "Aruba":"South/Latin America",
    "Barbados":"South/Latin America",
    "Bermuda":"South/Latin America",
    "Bolivia":"South/Latin America",
    "Brazil":"South/Latin America",
    "Bahamas":"South/Latin America",
    "Bouvet Island":"South/Latin America",
    "Belize":"South/Latin America",
    "Chile":"South/Latin America",
    "Colombia":"South/Latin America",
    "Costa Rica":"South/Latin America",
    "Cuba":"South/Latin America",
    "Dominica":"South/Latin America",
    "Dominican Republic":"South/Latin America",
    "Ecuador":"South/Latin America",
    "Falkland Islands (Malvinas)":"South/Latin America",
    "Grenada":"South/Latin America",
    "French Guiana":"South/Latin America",
    "Guadeloupe":"South/Latin America",
    "South Georgia and the South Sandwich Islands":"South/Latin America",
    "Guatemala":"South/Latin America",
    "Guyana":"South/Latin America",
    "Honduras":"South/Latin America",
    "Haiti":"South/Latin America",
    "Jamaica":"South/Latin America",
    "Saint Kitts and Nevis":"South/Latin America",
    "Cayman Islands":"South/Latin America",
    "Saint Lucia":"South/Latin America",
    "Martinique":"South/Latin America",
    "Montserrat":"South/Latin America",
    "Mexico":"South/Latin America",
    "Panama":"South/Latin America",
    "Peru":"South/Latin America",
    "Puerto Rico":"South/Latin America",
    "Paraguay":"South/Latin America",
    "Suriname":"South/Latin America",
    "El Salvador":"South/Latin America",
    "Turks and Caicos Islands":"South/Latin America",
    "Trinidad and Tobago":"South/Latin America",
    "Uruguay":"South/Latin America",
    "Saint Vincent and the Grenadines":"South/Latin America",
    "Venezuela":"South/Latin America",
    "Virgin Islands, British":"South/Latin America",
    "Virgin Islands, U.S.":"South/Latin America",
    "Saint Barthelemy":"South/Latin America",
    "Saint Martin":"South/Latin America",
    # From here on: non-standard spellings, combined and "ambiguous" entries
    # exactly as they appear in the data.
    'Taiwan, Province of China':"China & Taiwan",
    'Venezuela, Bolivarian Republic of':'South/Latin America',
    'Ambiguous':"Other",
    'Philippines, Sri Lanka':'Asia & Pacific',
    'Indonesia, Philippines':'Asia & Pacific',
    'India, Canada, Philippines':'Other',
    'China (Hong Kong)':"China & Taiwan",
    'Philippines, Kenya, Nigeria, Spain':"Other",
    'Kenya, Nigeria, South Africa, Uganda':"Africa",
    'Australia, ambiguous':"Asia & Pacific",
    'Indonesia, Malaysia, Ambiguous':"Asia & Pacific",
    'Thailand, ambiguous':"Asia & Pacific",
    'UK':"Europe",
    'US, Canada, UK':"Other",
    'Sri Lanka, Malaysia, the Philippines, India':"Asia & Pacific",
    'Philippines, Ambiguous':"Asia & Pacific",
    'India, Sri Lanka':"Asia & Pacific",
    'Australia, Ambiguous':"Asia & Pacific",
    'Hong Kong (ambiguous)':"China & Taiwan",
    'India, Nigeria, US, Cambodia, ambiguous':"Other",
    'Sri Lanka, Philippines':"Asia & Pacific",
    'Mexico, Ambiguous':'South/Latin America',
    'Ambiguous, Australia, Pakistan, South Africa':"Other",
    'Indonensia':"Asia & Pacific",
    'US, Ambiguous':"Canada & United States",
    'Thailand, Ambiguous':"Asia & Pacific",
    'China, ambiguous':"China & Taiwan",
    'US':"Canada & United States",
    'China (ambiguous)':"China & Taiwan",
    'Australia, India, South Africa, Ambiguous':"Other",
    'Canada, Ireland':"Other",
    'Australia, Turkey, the Netherlands, ambiguous':"Other",
    'Ambiguous, Portugal, France, Russia':"Other",
    'Indonesia, Malaysia':"Asia & Pacific",
    'Pakistan (ambiguous)':"Asia & Pacific",
    'Sri Lanka, ambiguous':"Asia & Pacific",
    'Australia, North America':"Other",
    'Pakistan, ambiguous':"Asia & Pacific",
    'India, ambiguous':"Asia & Pacific",
    'Malaysia, ambiguous':"Asia & Pacific",
    'Hong Kong, Thailand, ambiguous':"Asia & Pacific",
    'US, ambiguous':"Canada & United States",
    'Nigeria (ambiguous), Ghana (ambiguous)':"Africa",
    'Pakistan, India, ambiguous':"Asia & Pacific",
    'Indonesia, ambiguous':"Asia & Pacific",
    'India, Saudi Arabia, the Philippines.':"Other",
    'Pacific':"Asia & Pacific",
    'Ethiopia, ambiguous':"Africa",
    'Pakistan, India, Singapore':"Asia & Pacific",
    'Liberia, Nigeria':"Africa",
    'Hong Kong, Ambiguous':"China & Taiwan",
    'Nigeria, Zimbabwe, Zambia, Uganda.':"Africa",
    'Indonesia, Poland':"Other",
    'Tanzania, Uganda':"Africa",
    'USA (ambiguous)':"Canada & United States",
    'US, Canada, Australia':"Other",
    'Nigeria, Kenya':"Africa",
    'Congo, the Democratic Republic of the':"Africa",
    'South Africa, Ethiopia':"Africa",
    'Nigeria, Slovakia':"Other",
    # NOTE(review): "China" alone maps to "China & Taiwan" and "Pakistan" to
    # "Asia & Pacific"; other cross-region combinations use "Other" - confirm.
    'China, Pakistan':"Asia & Pacific",
    'Malaysia, India':"Asia & Pacific",
    'Kenya, Ambiguous':"Africa",
    'Singapore, Malaysia, Indonesia':"Asia & Pacific",
    'Nigeria, ambiguous':"Africa",
    'US, Nigeria':"Other",
    'Pakistan, ambiuous':"Asia & Pacific",
    # NOTE(review): "Canada" and "US" both map to "Canada & United States" on
    # their own, yet the combination is "Other" - confirm.
    'Canada, US':"Other",
    'Bolivia, Plurinational State of':'South/Latin America',
    'Sinhala':"Asia & Pacific",
    'Palestine, State of':'Middle east',
    'Guinea, Nigeria':"Africa",
    'South Africa, India, United States':"Other",
    'Madagascar, Nigeria, Ghana':"Africa",
    'Zambia, Nigeria, Madagascar':"Africa",
    'Lesotho, Kenya':"Africa",
    'Nigeria, Ambiguous':"Africa",
    # NOTE(review): "Bangladesh" alone maps to "Asia & Pacific", so this
    # combination spans two regions but is labelled "Africa" - confirm.
    'South Africa, Nigeria, Ghana, Bangladesh':"Africa",
    'Nigeria, United States':"Other",
    'Nigeria, Ghana':"Africa",
    'Canada, South Africa':"Other",
    "Côte d'Ivoire":"Africa",
    'Argentina ':'South/Latin America',
    "":"Other"
}

## Map each entry's primary country to its region using the dictionary. If the country is blank, assign the region "Other".

In [7]:
# Look up the region of every entry from its primary country. Entries with
# no primary country are labelled 'Other'; a country that is present but
# missing from region_dict is left as NaN.
df['region'] = (
    df['Primary_Country']
    .map(region_dict)
    .where(df['Primary_Country'].notnull(), 'Other')
)

In [8]:
df

Unnamed: 0,s_no,Reported_On,Additional_Reporting,Retrieve_from_1,Retrieve_from_2,Retrieve_from_3,Twitter_Reference,Direct_Post_1,Direct_Post_2,Direct_Post_3,Direct_Post_4,Title,Publication_Date,Entry_Date,Primary_Country,Secondary_Country,Primary_Language,Secondary_Language,Main_Narrative,Recoded_Main_Narrative,Recoded_Narrative_Coder,Narrative_Description,ChainMsg_or_Email,Motive,Motive_Description,Source,Source_Description,Distrib_Channel,Misinfo_Type,Key_Words,Summary,Coder,Notes,Region,region
0,1,https://www.buzzfeednews.com/article/ryanhates...,,https://www.buzzfeednews.com/article/ryanhates...,,,0.0,,,,,India Is In The Middle Of A Coronavirus YouTub...,2020-02-19,2020-03-16 00:00:00,India,,English,,Origin of the virus,,,Hindi language YouTube account saying COVID-19...,0.0,Politics,Efforts to spread false claims on the origins ...,Individual actor,General public,Youtube,Conspiracy,"Coronavirus, India, bat soup",Hindi language Youtube account suggesting COVI...,Jan,,,Asia & Pacific
1,2,https://twitter.com/Rangoli_A/status/122779241...,,,,,1.0,https://twitter.com/Rangoli_A/status/122779241...,,,,"Tweet: False reporting on ""shooting down"" peop...",2020-02-12,2020-03-16 00:00:00,China,,English,,Emergency responses,Government responses,Alaa,Twitter user posting a compilation video of pe...,0.0,Fear,"Twitter user stoking fear among other users, s...",Individual actor,General public,Twitter,False reporting,"Coronavirus, China, shooting","Tweet with video showing ""people getting shot ...",Jan,,,China & Taiwan
2,3,https://twitter.com/Woppa1Woppa/status/1220068...,,,,,1.0,https://twitter.com/Woppa1Woppa/status/1220068...,,,,"Tweet: A woman eating a delicacy, attributing ...",2020-02-03,2020-03-16 00:00:00,Canada,,English,,Origin of the virus,,,Twitter user posting a video of a celebrity ea...,0.0,Politics,Twitter user discrediting Chinese-American pop...,Individual actor,General public,Twitter,False reporting,"Coronavirus, Chinese food, bat soup","Video of an individual eating a delicacy, and ...",Jan,,,Canada & United States
3,4,https://twitter.com/FreddiGoldstein/status/123...,,,,,1.0,https://twitter.com/FreddiGoldstein/status/123...,,,,Tweet: False chain message about NYPD containm...,2020-03-12,2020-03-16 00:00:00,United States,,English,,Emergency responses,Government responses,Alaa,Chain message of NYPD containment responses.,1.0,Fear,Chain message spread to stoke fear among Ameri...,Individual actor,General public,"Media, SMS",False reporting,"Coronavirus, NYPD, containment zone",Tweet with a screenshot of chain message sugge...,Jan,,,Canada & United States
4,5,https://www.boomlive.in/health/hoax-alert-vira...,,https://www.boomlive.in/health/hoax-alert-vira...,,,0.0,,,,,Hoax Alert: Viral 'Emergency Notification' On ...,2020-01-28,2020-03-16 00:00:00,India,,English,,Emergency responses,Government responses,Alaa,Chain message of Indian Health Ministry respon...,1.0,Fear,Chain message spread to stoke fear among India...,Individual actor,General public,"Facebook, WhatsApp",False reporting,"Coronavirus, India, travel advisory",WhatsApp chain message circulating among India...,Jan,,,Asia & Pacific
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5608,5610,https://antigo.saude.gov.br/fakenews/46705-caf...,,,,,0.0,,,,,Café previne o coronavírus - É FAKE NEWS!,2020-04-10,2021-03-05 00:00:00,Brazil,,Portuguese,,False cures and preventative measures,,,Coffee can prevent COVID-19,1.0,False Hope,Have people believe an effective prevention me...,Individual actor,WhatsApp Chain,WhatsApp,Fake remedy,"Coronavirus, Brazil, Prevention, Coffee",A message chain claiming drinking hot coffee c...,Enrique,,LATAM,South/Latin America
5609,5611,https://antigo.saude.gov.br/fakenews/46776-num...,,,,,0.0,,,,,Número de óbitos por COVID é de 946 - É FAKE N...,2020-04-24,2021-03-05 00:00:00,Brazil,,Portuguese,,Other,,,The number of COVID-19 deaths is 946,1.0,Fear,Spread fear by inflating the number of deaths,Individual actor,WhatsApp Chain,WhatsApp,False reporting,"Coronavirus, Brazil, Deaths",A message chain claiming the number of deaths ...,Enrique,,LATAM,South/Latin America
5610,5612,https://antigo.saude.gov.br/fakenews/46777-mas...,,,,,0.0,,,,,Máscaras de doação da China são contaminadas c...,2020-04-24,2021-03-05 00:00:00,Brazil,,Portuguese,,Government responses,,,Facemasks donated by China contain the virus,1.0,Fear,Undermine China' aid and discourage people fro...,Individual actor,WhatsApp Chain,WhatsApp,Conspiracy,"Coronavirus, Brazil, China, Masks",A message chain claiming donations of masks ma...,Enrique,,LATAM,South/Latin America
5611,5613,https://antigo.saude.gov.br/fakenews/46779-mas...,,,,,0.0,,,,,Máscaras sem qualidade distribuídas pelo Minis...,2020-04-24,2021-03-05 00:00:00,Brazil,,Portuguese,,Government responses,,,low-quality masks were distributed by the Braz...,1.0,Politics,Make people not trust the government,Individual actor,WhatsApp Chain,WhatsApp,Conspiracy,"Coronavirus, Brazil, Masks",A message chain claiming masks distributed by ...,Enrique,,LATAM,South/Latin America


## Confirm there are no longer any entries with a blank "region" value

In [9]:
df[df["region"].isnull()]

Unnamed: 0,s_no,Reported_On,Additional_Reporting,Retrieve_from_1,Retrieve_from_2,Retrieve_from_3,Twitter_Reference,Direct_Post_1,Direct_Post_2,Direct_Post_3,Direct_Post_4,Title,Publication_Date,Entry_Date,Primary_Country,Secondary_Country,Primary_Language,Secondary_Language,Main_Narrative,Recoded_Main_Narrative,Recoded_Narrative_Coder,Narrative_Description,ChainMsg_or_Email,Motive,Motive_Description,Source,Source_Description,Distrib_Channel,Misinfo_Type,Key_Words,Summary,Coder,Notes,Region,region


In [10]:
df["region"].unique()

array(['Asia & Pacific', 'China & Taiwan', 'Canada & United States',
       'Europe', 'Russia', 'Middle east', 'Africa', 'South/Latin America',
       'Other', 'Turkey'], dtype=object)

In [11]:
df['region'].value_counts()

Asia & Pacific            1277
Europe                    1012
South/Latin America        679
China & Taiwan             541
Canada & United States     500
Africa                     491
Middle east                454
Turkey                     300
Other                      282
Russia                      77
Name: region, dtype: int64

## Check the main narrative options. As with the country names, the values are inconsistent and need to be cleaned

In [12]:
df["Main_Narrative"].unique()

array(['Origin of the virus', 'Emergency responses',
       'Weaponization or design', 'False cures and preventative measures',
       'False diagnosis procedures', 'Nature of the virus',
       'COVID-19 status of individuals', 'Other', 'Government responses',
       'Emergency Responses', 'Non-Government responses',
       'Government Responses'], dtype=object)

In [13]:
# Fold the 'Emergency responses' labels and the lower-case spelling of
# 'Government responses' into the single label 'Government Responses'.
narrative_variants = ['Emergency responses', 'Emergency Responses', 'Government responses']
df.loc[df['Main_Narrative'].isin(narrative_variants), 'Main_Narrative'] = 'Government Responses'

In [14]:
df["Main_Narrative"].unique()

array(['Origin of the virus', 'Government Responses',
       'Weaponization or design', 'False cures and preventative measures',
       'False diagnosis procedures', 'Nature of the virus',
       'COVID-19 status of individuals', 'Other',
       'Non-Government responses'], dtype=object)

## Check the motive options. As with the narratives, the values are inconsistent and need to be cleaned

In [15]:
df["Motive"].unique()

array(['Politics', 'Fear', 'Profit', 'Other', 'Help', 'False Hope',
       'Undermine target country institutions ', 'Downplay Severity',
       'Undermine target country institutions', 'False hope', nan,
       'other', 'Downplay severity', 'Undermine target country ',
       'Undermine target country'], dtype=object)

In [16]:
# Variant spelling -> canonical motive label. The pairs are applied in order,
# one exact-match replacement at a time.
# NOTE(review): 'Government Responses' is not among the Motive values listed
# by the preceding cell, so the first pair looks like a leftover - confirm.
motive_aliases = {
    'Government Responses': 'Politics',
    'Undermine target country institutions ': 'Undermine target country institutions',
    'Undermine target country ': 'Undermine target country institutions',
    'Undermine target country': 'Undermine target country institutions',
    'False hope': 'False Hope',
    'Downplay severity': 'Downplay Severity',
    'other': 'Other',
}
for variant, canonical in motive_aliases.items():
    df.loc[df['Motive'] == variant, 'Motive'] = canonical

In [17]:
df["Motive"].unique()

array(['Politics', 'Fear', 'Profit', 'Other', 'Help', 'False Hope',
       'Undermine target country institutions', 'Downplay Severity', nan],
      dtype=object)

In [18]:
# Save the frame as it stands after the region, narrative and motive clean-up;
# the row index is not written.
df.to_csv('covid_cleaned.csv', index=False)

In [19]:
df.columns

Index(['s_no', 'Reported_On', 'Additional_Reporting', 'Retrieve_from_1', 'Retrieve_from_2', 'Retrieve_from_3', 'Twitter_Reference', 'Direct_Post_1', 'Direct_Post_2', 'Direct_Post_3', 'Direct_Post_4', 'Title', 'Publication_Date', 'Entry_Date', 'Primary_Country', 'Secondary_Country', 'Primary_Language', 'Secondary_Language', 'Main_Narrative', 'Recoded_Main_Narrative', 'Recoded_Narrative_Coder', 'Narrative_Description', 'ChainMsg_or_Email', 'Motive', 'Motive_Description', 'Source', 'Source_Description', 'Distrib_Channel', 'Misinfo_Type', 'Key_Words', 'Summary', 'Coder', 'Notes', 'Region', 'region'], dtype='object')

In [20]:
df["Source"].unique()

array(['Individual actor', 'Companies', 'State sponsors', 'Media',
       'Political actor', 'Media, Political actor',
       'Political actor, media ', 'Individual actor, media',
       'Individual actor ', 'Media, Individual actor',
       'Individual actors', 'Political actors, media, individual actors',
       'Individual actors, political actors', 'Media, individual actors',
       'Political actor, state sponsors', 'state sponsors',
       'Individual actors, State sponsor', 'Individual actors, media',
       'Individual actors, Media', 'Individual actors, state sponsor',
       'Media, Individual actors', 'Individual actors, companies',
       'Media, Individual Actors', 'Companies, Individual actors',
       'Individual Actors', 'Political actors, individual actors',
       'Political actor, individual actors',
       'Individual actor, Political actor, Media', 'Media ',
       'Media, Polical actor, Individual actor', 'Individual Actor'],
      dtype=object)

In [21]:
# Order the entries chronologically by publication date.
df = df.sort_values('Publication_Date')

In [22]:
# Per motive and per weekly bin (frequency 'W-MON') of the publication date,
# count the non-null values in every remaining column.
df_vis = (
    df.groupby(['Motive', pd.Grouper(key='Publication_Date', freq='W-MON')])
    .count()
)

In [23]:
# Move the group keys (Motive, Publication_Date) out of the index and back
# into ordinary columns, then display the result.
df_vis = df_vis.reset_index()
df_vis

Unnamed: 0,Motive,Publication_Date,s_no,Reported_On,Additional_Reporting,Retrieve_from_1,Retrieve_from_2,Retrieve_from_3,Twitter_Reference,Direct_Post_1,Direct_Post_2,Direct_Post_3,Direct_Post_4,Title,Entry_Date,Primary_Country,Secondary_Country,Primary_Language,Secondary_Language,Main_Narrative,Recoded_Main_Narrative,Recoded_Narrative_Coder,Narrative_Description,ChainMsg_or_Email,Motive_Description,Source,Source_Description,Distrib_Channel,Misinfo_Type,Key_Words,Summary,Coder,Notes,Region,region
0,Downplay Severity,2020-01-27,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1
1,Downplay Severity,2020-02-03,3,3,0,1,0,0,3,1,0,0,0,3,3,3,0,3,0,3,1,1,3,3,3,3,3,3,3,3,3,3,2,0,3
2,Downplay Severity,2020-02-10,2,2,0,0,0,0,2,1,0,0,0,2,2,2,0,2,0,2,1,1,2,2,2,2,2,2,2,2,2,2,0,0,2
3,Downplay Severity,2020-02-17,1,1,0,0,0,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1
4,Downplay Severity,2020-03-02,4,4,0,2,0,0,4,0,0,0,0,4,4,4,1,4,0,4,0,0,4,4,4,4,4,4,4,4,4,4,1,1,4
5,Downplay Severity,2020-03-09,3,3,0,2,0,0,3,1,1,0,0,3,3,3,0,3,0,3,0,0,3,3,3,3,3,3,3,3,3,3,2,0,3
6,Downplay Severity,2020-03-16,5,5,0,3,1,0,5,5,0,0,0,5,5,5,0,5,0,5,1,1,5,5,5,5,5,5,5,5,5,5,2,1,5
7,Downplay Severity,2020-03-23,11,11,0,2,1,1,11,4,1,1,1,11,11,11,0,11,1,11,1,1,11,11,11,11,11,11,10,11,11,11,2,0,11
8,Downplay Severity,2020-03-30,4,4,0,2,0,0,4,3,1,0,0,4,4,4,0,4,1,4,0,0,4,4,4,4,4,4,4,4,4,4,0,0,4
9,Downplay Severity,2020-04-06,11,11,0,3,1,0,11,3,0,0,0,11,11,11,0,11,0,11,6,6,11,11,11,11,11,11,11,11,11,11,2,1,11


In [24]:
# Keep the group keys plus the s_no count (used as the number of entries)
# and give the three columns short export names.
motive_columns = {'Motive': 'motive', 'Publication_Date': 'date', 's_no': 'count'}
df_motive = df_vis[list(motive_columns.keys())]
df_motive.columns = list(motive_columns.values())

In [25]:
df_motive

Unnamed: 0,motive,date,count
0,Downplay Severity,2020-01-27,1
1,Downplay Severity,2020-02-03,3
2,Downplay Severity,2020-02-10,2
3,Downplay Severity,2020-02-17,1
4,Downplay Severity,2020-03-02,4
5,Downplay Severity,2020-03-09,3
6,Downplay Severity,2020-03-16,5
7,Downplay Severity,2020-03-23,11
8,Downplay Severity,2020-03-30,4
9,Downplay Severity,2020-04-06,11


In [26]:
# df_motive['date'] = df_motive['date'].apply(lambda x: year_fraction(x))
# df_motive['date'] = df_motive['date'].apply(year_fraction)


In [27]:
df_motive

Unnamed: 0,motive,date,count
0,Downplay Severity,2020-01-27,1
1,Downplay Severity,2020-02-03,3
2,Downplay Severity,2020-02-10,2
3,Downplay Severity,2020-02-17,1
4,Downplay Severity,2020-03-02,4
5,Downplay Severity,2020-03-09,3
6,Downplay Severity,2020-03-16,5
7,Downplay Severity,2020-03-23,11
8,Downplay Severity,2020-03-30,4
9,Downplay Severity,2020-04-06,11


In [28]:
# Write the weekly motive counts as a JSON list with one object per row,
# dates serialised in ISO format.
df_motive.to_json("motive.json", orient='records', date_format='iso')

In [29]:
# Per main narrative and per weekly bin (frequency 'W-MON') of the publication
# date, count the non-null values in every remaining column, then turn the
# group keys back into ordinary columns.
df_vis = (
    df.groupby(['Main_Narrative', pd.Grouper(key='Publication_Date', freq='W-MON')])
    .count()
    .reset_index()
)
# Take an explicit copy: assigning a column on a bare column-subset of df_vis
# is what raised SettingWithCopyWarning here before.
df_narrative = df_vis[['Main_Narrative', 'Publication_Date', 's_no']].copy()
df_narrative.columns = ['narrative', 'date', 'count']
# Reduce each weekly timestamp to a plain datetime.date.
df_narrative['date'] = df_narrative['date'].dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_narrative['date'] = df_narrative.date.apply(lambda x: x.date())


In [30]:
df_narrative

Unnamed: 0,narrative,date,count
0,COVID-19 status of individuals,2020-01-27,8
1,COVID-19 status of individuals,2020-02-03,18
2,COVID-19 status of individuals,2020-02-10,23
3,COVID-19 status of individuals,2020-02-17,16
4,COVID-19 status of individuals,2020-02-24,18
5,COVID-19 status of individuals,2020-03-02,33
6,COVID-19 status of individuals,2020-03-09,30
7,COVID-19 status of individuals,2020-03-16,31
8,COVID-19 status of individuals,2020-03-23,35
9,COVID-19 status of individuals,2020-03-30,34


In [31]:
# Write the weekly narrative counts as a JSON list with one object per row,
# dates serialised in ISO format.
df_narrative.to_json("narrative.json", orient='records', date_format='iso')

In [32]:
# Per region and per weekly bin (frequency 'W-MON') of the publication date,
# count the non-null values in every remaining column, then turn the group
# keys back into ordinary columns.
df_vis = (
    df.groupby(['region', pd.Grouper(key='Publication_Date', freq='W-MON')])
    .count()
    .reset_index()
)
# Keep the group keys plus the s_no count (used as the number of entries)
# under short export names.
region_columns = {'region': 'region', 'Publication_Date': 'date', 's_no': 'count'}
df_region = df_vis[list(region_columns.keys())]
df_region.columns = list(region_columns.values())

In [33]:
df_region

Unnamed: 0,region,date,count
0,Africa,2020-02-03,2
1,Africa,2020-02-10,5
2,Africa,2020-02-17,5
3,Africa,2020-02-24,10
4,Africa,2020-03-02,2
5,Africa,2020-03-09,7
6,Africa,2020-03-16,8
7,Africa,2020-03-23,14
8,Africa,2020-03-30,26
9,Africa,2020-04-06,26


In [34]:
# Write df_region to region.json as a JSON array with one object per row
# (orient='records'); date_format='iso' requests ISO 8601 date strings.
df_region.to_json("region.json", orient='records', date_format='iso')

In [35]:
# Number of distinct Source labels; dropna=False counts a missing value as one
# label, the same way unique() does.
df["Source"].nunique(dropna=False)

31

In [36]:
type(df_narrative['date'][0])

datetime.date

In [37]:
df

Unnamed: 0,s_no,Reported_On,Additional_Reporting,Retrieve_from_1,Retrieve_from_2,Retrieve_from_3,Twitter_Reference,Direct_Post_1,Direct_Post_2,Direct_Post_3,Direct_Post_4,Title,Publication_Date,Entry_Date,Primary_Country,Secondary_Country,Primary_Language,Secondary_Language,Main_Narrative,Recoded_Main_Narrative,Recoded_Narrative_Coder,Narrative_Description,ChainMsg_or_Email,Motive,Motive_Description,Source,Source_Description,Distrib_Channel,Misinfo_Type,Key_Words,Summary,Coder,Notes,Region,region
4524,4525,https://teyit.org/fotografin-koronavirus-tasiy...,,https://teyit.link/DMqpZxW,https://teyit.link/d4oYHje,https://teyit.link/ALZshjx,1.0,,,,,Fotoğrafın koronavirüs taşıyan birinin kolunu ...,2020-01-23,2020-12-21 00:00:00,Turkey,,Turkish,,Other,,,Claim that a photo shows arm of an coronavirus...,0.0,Fear,The image shows a disturbing photo of an infec...,Individual actor,General public,Twitter,False reporting,"Coronavirus, Turkey, arm, infection",Claim that a photo shows arm of an coronavirus...,Ulaş,-,,Turkey
315,316,https://www.thepaper.cn/newsDetail_forward_561...,,,,,0.0,,,,,饮用高度酒能抵抗新型冠状病毒？白岩松辟谣,2020-01-23,2020-04-06 00:00:00,China,,Mandarin,,Nature of the virus,,,Famous anchor Bai Yansong spread rumors that d...,0.0,Help,Spreads claims that drinking the alcohol with ...,Individual actor,Bai Yansong,CCTV Channel News 1+1,Fake remedy,"Coronavirus, Bai yansong, anchor, alcohol",Famous anchor Bai yangsong spread rumors that ...,Anne,,,China & Taiwan
1803,1804,http://www.xgrb.cn/xczjc1/2020-02-29/291904.html,,,,,0.0,,,,,武汉官方：“在武汉上空开始播撒消毒粉液”是谣言！\n,2020-01-23,2020-06-15 00:00:00,China,,Mandarin,,Government Responses,Government responses,Olivia,"Wuhan official: ""Starting to spread disinfecta...",1.0,Fear,Spreads fear that the Wuhan City will spill di...,Individual actor,General public,Social media,False reporting,"Coronavirus, Wuhan, disinfectant, spread, powder",Wuhan official refuted a rumor that claimed di...,Anne,,,China & Taiwan
533,534,https://m.weibo.cn/detail/4464033279702648,,,,,1.0,https://m.weibo.cn/detail/4464033279702648,,,,不切身体会真的不知道家里面真的信吃香蕉会感染肺炎……我前几天疯狂科普终于戴上口罩，结果一个网...,2020-01-23,2020-04-14 00:00:00,China,,Mandarin,,False cures and preventative measures,,,Rumors claimed people stopped eating bananas t...,1.0,Fear,Scares people not to eat bananas to prevent th...,Individual actor,Weibo user @kelly雅雅K,Weibo,Fake remedy,"Coronavirus, bananas, prevent, stopped, eating",A twitter user claimed people stopped eating b...,Anne,,,China & Taiwan
26,27,https://www.politifact.com/factchecks/2020/jan...,,,,,0.0,,,,,Says a coronavirus patent expired just as ther...,2020-01-23,2020-03-16 00:00:00,United States,,English,,Origin of the virus,,,"Outbreak coincides with a patent expiration, a...",0.0,False Hope,Conservative Facebook group claiming the outbr...,Individual actor,General public,Facebook,Conspiracy,"Coronavirus, patent, SARS, vaccine",Using an expiration for a patent associated wi...,Jan,,,Canada & United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5071,5072,https://teyit.org/analiz-israilde-bir-kisinin-...,,https://archive.is/4IMlj,https://archive.is/cTO98,https://archive.is/HUHTG,1.0,,,,,İsrail’de bir kişinin Covid-19 aşısı nedeniyle...,2020-12-31,2021-01-30 00:00:00,Turkey,Israel,Turkish,,False cures and preventative measures,,,Claim that a person had a heart attack and die...,0.0,Fear,"To spread fear about vaccines, clickbait",Media,"HABERLER.COM, Internet Haber, haber7.com, Sözc...","Website, Twitter",False reporting,"Turkey, Israel, vaccine, death",Claim that a person had a heart attack and die...,Ulaş,,,Turkey
5069,5070,https://teyit.org/analiz-asi-ureticilerinin-as...,,https://archive.is/PjsMN,https://archive.is/vLmVU,,1.0,,,,,Aşı üreticilerinin aşı olmasının yasak olduğu ...,2020-12-31,2021-01-30 00:00:00,Turkey,Germany,Turkish,German,False cures and preventative measures,,,"Claim that Uğur Şahin, the CEO of BionTech, is...",0.0,Politics,To indicate that that vaccines are not safe ar...,Individual actor,"Metin Külük, Hazar Tandoğan","Twitter, Facebook",Conspiracy,"Turkey, vaccine, mRNA, Uğur Şahin","Claim that Uğur Şahin, the CEO of BionTech, i...",Ulaş,,,Turkey
4944,4945,https://factcheck.afp.com/video-former-china-c...,,https://factcheck.afp.com/video-former-china-c...,,,1.0,,,,,Video of former China CDC director criticising...,2020-12-31,2021-01-24 00:00:00,China,,English,,False cures and preventative measures,,,A news clip has been viewed tens of thousands ...,1.0,Undermine target country institutions,Undermines vaccines that China made,Individual actor,General public,Twitter,False reporting,"Coronavirus, China, vaccine, Chinese Center fo...",A news clip has been viewed tens of thousands ...,Anne,,,China & Taiwan
4945,4946,https://factcheck.afp.com/korean-social-media-...,,https://factcheck.afp.com/korean-social-media-...,,,0.0,,,,,Korean social media posts share false claim th...,2020-12-31,2021-01-24 00:00:00,"Korea, Democratic People's Republic of",,English,,False cures and preventative measures,,,Multiple posts shared repeatedly on Facebook c...,1.0,Help,Claims that drinking tea with pepper stems can...,Individual actor,Facebook users,Facebook,False reporting,"Coronavirus, Korea, tea, pepper, stem, prevent",Korean social media posts share false claim th...,Anne,,,Asia & Pacific


## Cleaning for the story viz

In [38]:
# Load the per-country plastic dataset; below it is used as the reference list
# of country names and as the source of country codes and lat/long for the story table.
df_plastic = pd.read_csv('mismanaged_plastic.csv')

In [39]:
df_plastic

Unnamed: 0,country,country_code,year,pw_capita,pw_total,lat,long
0,Albania,ALB,2019.0,0.543214,1565,41.000028,19.999962
1,Algeria,DZA,2019.0,0.134114,5774,28.000027,2.999983
2,Angola,AGO,2019.0,0.027023,860,-11.877577,17.569124
3,Antigua and Barbuda,ATG,2019.0,0.020619,2,17.223472,-61.955461
4,Argentina,ARG,2019.0,0.092383,4137,-34.996496,-64.967282
5,Australia,AUS,2019.0,0.00127,32,-24.776109,134.755
6,Bahamas,BHS,2019.0,0.051414,20,24.773655,-78.000055
7,Bahrain,BHR,2019.0,0.0,0,26.155125,50.534461
8,Bangladesh,BGD,2019.0,0.151123,24640,24.476929,90.293441
9,Barbados,BRB,2019.0,0.156794,45,13.150033,-59.52503


In [40]:
df_plastic["country"].unique()

array(['Albania', 'Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina',
       'Australia', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belgium', 'Belize', 'Benin', 'Bosnia and Herzegovina', 'Brazil',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Cameroon',
       'Canada', 'Cape Verde', 'Chile', 'China', 'Colombia', 'Comoros',
       'Congo', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cyprus',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Fiji', 'Finland',
       'France', 'French Guiana', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Grenada', 'Guadeloupe', 'Guatemala', 'Guinea',
       'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong',
       'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland',
       'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan',
       'Kenya'

In [41]:
df["Primary_Country"].unique()

array(['Turkey', 'China', 'United States',
       'Venezuela, Bolivarian Republic of', 'Japan', 'Canada', 'Brazil',
       'Taiwan, Province of China', 'Colombia', 'Australia', 'Ambiguous',
       'Russian Federation', 'Greenland', 'Philippines', 'Thailand',
       'Northern Mariana Islands', 'Ecuador', 'Singapore', 'Belarus',
       'Germany', 'Sri Lanka', 'India', 'Pakistan', 'Spain', 'Egypt',
       'Hong Kong', 'Philippines, Sri Lanka', 'United Kingdom',
       'United Arab Emirates', 'Indonesia', 'Korea, Republic of', 'Kenya',
       'Ghana', 'Israel', 'Malaysia', 'Guam', 'Indonesia, Philippines',
       'Fiji', 'Nigeria', 'Afghanistan', 'Lebanon',
       'India, Canada, Philippines', 'South Africa', 'Iraq', 'Indonensia',
       'Malawi', 'Cambodia', 'Mexico', 'US, Ambiguous', 'Andorra',
       'China (Hong Kong)', 'France', 'Algeria',
       'Philippines, Kenya, Nigeria, Spain', 'Mali',
       'Iran, Islamic Republic of',
       'Kenya, Nigeria, South Africa, Uganda', 'Cuba', 'Su

In [42]:
# Primary_Country values that do not appear in df_plastic's country column.
res = df.loc[~df["Primary_Country"].isin(df_plastic["country"]), "Primary_Country"]

In [43]:
res.unique()

array(['Venezuela, Bolivarian Republic of', 'Taiwan, Province of China',
       'Ambiguous', 'Russian Federation', 'Greenland',
       'Northern Mariana Islands', 'Belarus', 'Philippines, Sri Lanka',
       'Korea, Republic of', 'Guam', 'Indonesia, Philippines',
       'Afghanistan', 'India, Canada, Philippines', 'Indonensia',
       'Malawi', 'US, Ambiguous', 'Andorra', 'China (Hong Kong)',
       'Philippines, Kenya, Nigeria, Spain', 'Mali',
       'Iran, Islamic Republic of',
       'Kenya, Nigeria, South Africa, Uganda', 'Cuba', 'Ethiopia',
       'Australia, ambiguous', 'Thailand, Ambiguous', 'China, ambiguous',
       'Tanzania, United Republic of', 'Palestine, State of',
       'Indonesia, Malaysia, Ambiguous', 'Thailand, ambiguous', 'Armenia',
       'US', 'Moldova, Republic of', 'Bolivia, Plurinational State of',
       'Nepal', 'US, Canada, UK',
       'Sri Lanka, Malaysia, the Philippines, India',
       'Hong Kong (ambiguous)', 'India, Nigeria, US, Cambodia, ambiguous',
   

In [44]:
# Maps raw Primary_Country values to a single cleaned country name.
#   * Long-form names are shortened ("Russian Federation" -> "Russia").
#   * "<country>, ambiguous" variants and misspellings collapse to the country.
#   * Entries naming several countries, or none, become "Other".
#   * Territories are folded into another country (e.g. Guam -> United States,
#     Greenland -> Denmark).
# Identity entries (e.g. "Belarus": "Belarus") are no-ops, kept so the dict
# lists the values that had no match in df_plastic["country"].
country_dict = {
    "Venezuela, Bolivarian Republic of":"Venezuela",
    "Taiwan, Province of China":"Taiwan",
    "Ambiguous":"Other",
    "Russian Federation":"Russia",
    "Greenland":"Denmark",
    "Northern Mariana Islands":"United States",
    "Belarus":"Belarus",
    "Philippines, Sri Lanka":"Other",
    "Korea, Republic of":"South Korea",
    "Guam":"United States",
    "Indonesia, Philippines":"Other",
    "Afghanistan":"Afghanistan",
    "India, Canada, Philippines":"Other",
    "Indonensia":"Indonesia",
    "Malawi":"Malawi",
    "US, Ambiguous":"United States",
    "Andorra":"Andorra",
    "China (Hong Kong)":"China",
    "Philippines, Kenya, Nigeria, Spain":"Other",
    "Mali":"Mali",
    "Iran, Islamic Republic of":"Iran",
    "Kenya, Nigeria, South Africa, Uganda":"Other",
    "Cuba":"Cuba",
    "Ethiopia":"Ethiopia",
    "Australia, ambiguous":"Australia",
    "Thailand, Ambiguous":"Thailand",
    "China, ambiguous":"China",
    "Tanzania, United Republic of":"Tanzania",
    "Palestine, State of":"Palestine",
    "Indonesia, Malaysia, Ambiguous":"Other",
    "Thailand, ambiguous":"Thailand",
    "Armenia":"Armenia",
    "US":"United States",
    "Moldova, Republic of":"Moldova",
    "Bolivia, Plurinational State of":"Bolivia",
    "Nepal":"Nepal",
    "US, Canada, UK":"Other",
    "Sri Lanka, Malaysia, the Philippines, India":"Other",
    "Hong Kong (ambiguous)":"China",
    "India, Nigeria, US, Cambodia, ambiguous":"Other",
    "Uganda":"Uganda",
    "China (ambiguous)":"China",
    "Australia, India, South Africa, Ambiguous":"Other",
    "Canada, Ireland":"Other",
    "Nauru":"Nauru",
    "Czech Republic":"Czech Republic",
    "Ambiguous, Portugal, France, Russia":"Other",
    "Australia, Turkey, the Netherlands, ambiguous":"Other",
    "Bhutan":"Bhutan",
    "Pakistan (ambiguous)":"Pakistan",
    "Hong Kong, Thailand, ambiguous":"Other",
    "US, ambiguous":"United States",
    "Australia, North America":"Other",
    "Paraguay":"Paraguay",
    "Nigeria (ambiguous), Ghana (ambiguous)":"Other",
    "India, Saudi Arabia, the Philippines.":"Other",
    "UK":"United Kingdom",
    "Philippines, Ambiguous":"Philippines",
    "Australia, Ambiguous":"Australia",
    "India, Sri Lanka":"Other",
    # The plastic dataset lists 'Congo' and 'Democratic Republic of Congo' as
    # two separate countries; map the DRC to its own entry, not to 'Congo'.
    "Congo, the Democratic Republic of the":"Democratic Republic of Congo",
    "Indonesia, Malaysia":"Other",
    "Sri Lanka, Philippines":"Other",
    "Mexico, Ambiguous":"Mexico",
    "Ambiguous, Australia, Pakistan, South Africa":"Other",
    "Syrian Arab Republic":"Syria",
    "Sri Lanka, ambiguous":"Sri Lanka",
    "Pakistan, ambiguous":"Pakistan",
    "India, ambiguous":"India",
    "Pakistan, India, ambiguous":"Other",
    "Malaysia, ambiguous":"Malaysia",
    "Indonesia, ambiguous":"Indonesia",
    "Ethiopia, ambiguous":"Ethiopia",
    "Liberia, Nigeria":"Other",
    "Pakistan, India, Singapore":"Other",
    # NOTE(review): the Hong Kong variants are folded into China while a plain
    # "Hong Kong" value is left as-is; confirm this is intended.
    "Hong Kong, Ambiguous":"China",
    "Indonesia, Poland":"Other",
    "Nigeria, Zimbabwe, Zambia, Uganda.":"Other",
    "USA (ambiguous)":"United States",
    "South Sudan":"South Sudan",
    "American Samoa":"United States",
    "Kenya, Ambiguous":"Kenya",
    "Rwanda":"Rwanda",
    "Pacific":"Other",
    "Tanzania, Uganda":"Other",
    "US, Canada, Australia":"Other",
    "Nigeria, Kenya":"Other",
    "South Africa, Ethiopia":"Other",
    "Nigeria, Slovakia":"Other",
    "China, Pakistan":"Other",
    "Burundi":"Burundi",
    "Malaysia, India":"Other",
    "Côte d'Ivoire":"Cote d'Ivoire",
    "Singapore, Malaysia, Indonesia":"Other",
    "Austria":"Austria",
    "US, Nigeria":"Other",
    "Nigeria, ambiguous":"Nigeria",
    "Canada, US":"Other",
    "Pakistan, ambiuous":"Pakistan",
    "Sinhala":"Sri Lanka",
    "Guinea, Nigeria":"Other",
    "South Africa, India, United States":"Other",
    "Madagascar, Nigeria, Ghana":"Other",
    "Zambia, Nigeria, Madagascar":"Other",
    "Lesotho, Kenya":"Other",
    "Nigeria, Ambiguous":"Nigeria",
    "South Africa, Nigeria, Ghana, Bangladesh":"Other",
    "Nigeria, United States":"Other",
    "Canada, South Africa":"Other",
    "Central African Republic":"Central African Republic",
    "Nigeria, Ghana":"Other",
    "Virgin Islands, U.S.":"United States",
    "Hungary":"Hungary",
    "Korea, Democratic People's Republic of":"North Korea",
    # The trailing space in the key is deliberate: it matches the raw value.
    "Argentina ":"Argentina"
}

In [45]:
# Normalise country names. Assign the result back to the column rather than
# calling replace(..., inplace=True) on df["Primary_Country"]: the in-place
# form works on an intermediate Series and does not update df when pandas
# Copy-on-Write is enabled.
df["Primary_Country"] = df["Primary_Country"].replace(country_dict)

# Records with no primary country are counted under "Other".
df["Primary_Country"] = df["Primary_Country"].fillna("Other")

# Replace the existing index with a fresh 0..n-1 index; the old index is moved
# into a column by reset_index().
df = df.reset_index()

In [46]:
# After cleaning: Primary_Country values still absent from df_plastic's country column.
res = df.loc[~df["Primary_Country"].isin(df_plastic["country"]), "Primary_Country"]

In [47]:
res.unique().shape

(23,)

In [48]:
df.loc[df['Primary_Country']=="China"].shape

(457, 36)

In [49]:
df['Primary_Country'].shape

(5613,)

In [50]:
# One row per country with the number of records (non-null s_no) for it.
df_grouped = (
    df.groupby("Primary_Country")["s_no"]
      .count()
      .rename("num_fake_news")
      .rename_axis("country")
      .reset_index()
)

In [51]:
df_plastic

Unnamed: 0,country,country_code,year,pw_capita,pw_total,lat,long
0,Albania,ALB,2019.0,0.543214,1565,41.000028,19.999962
1,Algeria,DZA,2019.0,0.134114,5774,28.000027,2.999983
2,Angola,AGO,2019.0,0.027023,860,-11.877577,17.569124
3,Antigua and Barbuda,ATG,2019.0,0.020619,2,17.223472,-61.955461
4,Argentina,ARG,2019.0,0.092383,4137,-34.996496,-64.967282
5,Australia,AUS,2019.0,0.00127,32,-24.776109,134.755
6,Bahamas,BHS,2019.0,0.051414,20,24.773655,-78.000055
7,Bahrain,BHR,2019.0,0.0,0,26.155125,50.534461
8,Bangladesh,BGD,2019.0,0.151123,24640,24.476929,90.293441
9,Barbados,BRB,2019.0,0.156794,45,13.150033,-59.52503


In [52]:
# Work on an independent copy so df_plastic stays untouched (copy() is deep by default).
df_story = df_plastic.copy()

In [53]:
df_story

Unnamed: 0,country,country_code,year,pw_capita,pw_total,lat,long
0,Albania,ALB,2019.0,0.543214,1565,41.000028,19.999962
1,Algeria,DZA,2019.0,0.134114,5774,28.000027,2.999983
2,Angola,AGO,2019.0,0.027023,860,-11.877577,17.569124
3,Antigua and Barbuda,ATG,2019.0,0.020619,2,17.223472,-61.955461
4,Argentina,ARG,2019.0,0.092383,4137,-34.996496,-64.967282
5,Australia,AUS,2019.0,0.00127,32,-24.776109,134.755
6,Bahamas,BHS,2019.0,0.051414,20,24.773655,-78.000055
7,Bahrain,BHR,2019.0,0.0,0,26.155125,50.534461
8,Bangladesh,BGD,2019.0,0.151123,24640,24.476929,90.293441
9,Barbados,BRB,2019.0,0.156794,45,13.150033,-59.52503


In [54]:
df_grouped.sort_values(by=['num_fake_news'])

Unnamed: 0,country,num_fake_news
28,Dominican Republic,1
75,Norway,1
2,Andorra,1
27,Denmark,1
59,Lithuania,1
60,Madagascar,1
61,Malawi,1
36,Gabon,1
80,Papua New Guinea,1
72,Nicaragua,1


In [55]:
# Record counts (non-null s_no) per (country, source) pair.
df_grouped2 = (
    df.groupby(["Primary_Country", "Source"])["s_no"]
      .count()
      .reset_index(name="num_fake_news")
      .rename(columns={"Primary_Country": "country"})
)

In [56]:
df_grouped2.sort_values(by=['num_fake_news'])

Unnamed: 0,country,Source,num_fake_news
138,Iraq,Individual Actor,1
173,Lebanon,Political actor,1
175,Libya,Media,1
97,Gabon,Media,1
96,France,State sponsors,1
176,Lithuania,Media,1
177,Madagascar,Individual actor,1
178,Malawi,Media,1
183,Malaysia,Political actor,1
90,Ethiopia,"Media, individual actors",1


In [57]:
# Recompute the Primary_Country values with no match in df_plastic["country"]
# (same expression as the earlier post-cleaning check). res is used below to
# append the unmatched countries to df_story.
res = df["Primary_Country"][~df["Primary_Country"].isin(df_plastic["country"])]

In [58]:
res.unique().shape

(23,)

In [59]:
df_grouped

Unnamed: 0,country,num_fake_news
0,Afghanistan,28
1,Algeria,25
2,Andorra,1
3,Argentina,82
4,Armenia,6
5,Australia,94
6,Austria,6
7,Bahrain,3
8,Bangladesh,17
9,Belarus,7


In [60]:
# Start with an empty-string placeholder count for every country in the
# plastic table, indexed by country so update() can align on it.
df_story = df_story.assign(num_fake_news="").set_index('country')

# df_grouped stays indexed by country from here on; a later update() call
# depends on that.
df_grouped = df_grouped.set_index('country')

# Overwrite the placeholder wherever df_grouped has a count for the country.
df_story.update(df_grouped)
df_story = df_story.reset_index()
df_story

Unnamed: 0,country,country_code,year,pw_capita,pw_total,lat,long,num_fake_news
0,Albania,ALB,2019.0,0.543214,1565,41.000028,19.999962,
1,Algeria,DZA,2019.0,0.134114,5774,28.000027,2.999983,25.0
2,Angola,AGO,2019.0,0.027023,860,-11.877577,17.569124,
3,Antigua and Barbuda,ATG,2019.0,0.020619,2,17.223472,-61.955461,
4,Argentina,ARG,2019.0,0.092383,4137,-34.996496,-64.967282,82.0
5,Australia,AUS,2019.0,0.00127,32,-24.776109,134.755,94.0
6,Bahamas,BHS,2019.0,0.051414,20,24.773655,-78.000055,
7,Bahrain,BHR,2019.0,0.0,0,26.155125,50.534461,3.0
8,Bangladesh,BGD,2019.0,0.151123,24640,24.476929,90.293441,17.0
9,Barbados,BRB,2019.0,0.156794,45,13.150033,-59.52503,


In [61]:
# Keep only countries whose placeholder was replaced by a count. reset_index()
# keeps the old positions in an 'index' column, which is dropped further down.
has_count = df_story['num_fake_news'].ne('')
df_story = df_story.loc[has_count].reset_index()
df_story

Unnamed: 0,index,country,country_code,year,pw_capita,pw_total,lat,long,num_fake_news
0,1,Algeria,DZA,2019.0,0.134114,5774,28.000027,2.999983,25.0
1,4,Argentina,ARG,2019.0,0.092383,4137,-34.996496,-64.967282,82.0
2,5,Australia,AUS,2019.0,0.00127,32,-24.776109,134.755,94.0
3,7,Bahrain,BHR,2019.0,0.0,0,26.155125,50.534461,3.0
4,8,Bangladesh,BGD,2019.0,0.151123,24640,24.476929,90.293441,17.0
5,10,Belgium,BEL,2019.0,0.002947,34,50.640281,4.666715,2.0
6,14,Brazil,BRA,2019.0,0.1791,37799,-10.333333,-53.2,210.0
7,18,Cambodia,KHM,2019.0,0.067508,1113,12.543322,104.814491,4.0
8,19,Cameroon,CMR,2019.0,0.41239,10671,4.612552,13.153581,2.0
9,20,Canada,CAN,2019.0,0.006362,238,61.066692,-107.991707,20.0


In [62]:
# Append one row (country name only) for each value that had no match in the
# plastic table, then order the combined table alphabetically by country.
unmatched = pd.DataFrame({"country": res.unique()})
df_story = pd.concat([df_story, unmatched]).sort_values(by="country")

In [63]:
# Index by country so update() aligns rows with df_grouped, which an earlier
# cell re-indexed by country (this cell relies on that state).
df_story = df_story.set_index('country')
# In-place update: takes num_fake_news from df_grouped for every matching
# country, including the rows that were just appended without a count.
df_story.update(df_grouped)
# Turn country back into an ordinary column.
df_story.reset_index(inplace=True)
df_story

Unnamed: 0,country,index,country_code,year,pw_capita,pw_total,lat,long,num_fake_news
0,Afghanistan,,,,,,,,28
1,Algeria,1.0,DZA,2019.0,0.134114,5774.0,28.000027,2.999983,25
2,Andorra,,,,,,,,1
3,Argentina,4.0,ARG,2019.0,0.092383,4137.0,-34.996496,-64.967282,82
4,Armenia,,,,,,,,6
5,Australia,5.0,AUS,2019.0,0.00127,32.0,-24.776109,134.755,94
6,Austria,,,,,,,,6
7,Bahrain,7.0,BHR,2019.0,0.0,0.0,26.155125,50.534461,3
8,Bangladesh,8.0,BGD,2019.0,0.151123,24640.0,24.476929,90.293441,17
9,Belarus,,,,,,,,7


In [64]:
# Remove the plastic-waste measures and the leftover 'index' column.
df_story = df_story.drop(['year', 'pw_capita', 'pw_total', 'index'], axis=1)
df_story

Unnamed: 0,country,country_code,lat,long,num_fake_news
0,Afghanistan,,,,28
1,Algeria,DZA,28.000027,2.999983,25
2,Andorra,,,,1
3,Argentina,ARG,-34.996496,-64.967282,82
4,Armenia,,,,6
5,Australia,AUS,-24.776109,134.755,94
6,Austria,,,,6
7,Bahrain,BHR,26.155125,50.534461,3
8,Bangladesh,BGD,24.476929,90.293441,17
9,Belarus,,,,7


In [65]:
df_story[df_story.country_code.isnull()]

Unnamed: 0,country,country_code,lat,long,num_fake_news
0,Afghanistan,,,,28
2,Andorra,,,,1
4,Armenia,,,,6
6,Austria,,,,6
9,Belarus,,,,7
11,Bhutan,,,,6
12,Bolivia,,,,41
14,Burundi,,,,8
18,Central African Republic,,,,4
25,Cuba,,,,6


In [66]:
# Country code and approximate centre point (lat, long in decimal degrees) for
# the countries that are missing from the plastic dataset.
# Sign convention: north / east positive, south / west negative. Several of
# these were previously entered without the minus sign (Bolivia, Burundi,
# Cuba, Malawi, Mali, Nauru, Paraguay, Rwanda), which put them in the wrong
# hemisphere.
missing_geo = {
    'Afghanistan': ('AFG', 33.9391, 67.7100),
    'Andorra': ('AND', 42.5063, 1.5218),
    'Armenia': ('ARM', 40.0691, 45.0382),
    'Austria': ('AUT', 47.5162, 14.5501),
    'Belarus': ('BLR', 53.7098, 27.9534),
    'Bhutan': ('BTN', 27.5142, 90.4336),
    'Bolivia': ('BOL', -16.2902, -63.5887),
    'Burundi': ('BDI', -3.3731, 29.9189),
    'Central African Republic': ('CAF', 6.6111, 20.9394),
    'Cuba': ('CUB', 21.5218, -77.7812),
    'Czech Republic': ('CZE', 49.8175, 15.4730),
    'Ethiopia': ('ETH', 9.1450, 40.4897),
    'Hungary': ('HUN', 47.1625, 19.5033),
    'Malawi': ('MWI', -13.2543, 34.3015),
    'Mali': ('MLI', 17.5707, -3.9962),
    'Moldova': ('MDA', 47.4116, 28.3699),
    'Nauru': ('NRU', -0.5228, 166.9315),
    'Nepal': ('NPL', 28.3949, 84.1240),
    'Paraguay': ('PRY', -23.4425, -58.4438),
    'Rwanda': ('RWA', -1.9403, 29.8739),
    'South Sudan': ('SSD', 6.8770, 31.3070),
    'Uganda': ('UGA', 1.3733, 32.2903),
}

# Fill each column separately so the numeric lat/long columns are assigned
# plain floats and country_code is assigned a string.
for country, (code, lat, lon) in missing_geo.items():
    row_mask = df_story['country'] == country
    df_story.loc[row_mask, 'country_code'] = code
    df_story.loc[row_mask, 'lat'] = lat
    df_story.loc[row_mask, 'long'] = lon

In [67]:
df_story

Unnamed: 0,country,country_code,lat,long,num_fake_news
0,Afghanistan,AFG,33.9391,67.71,28
1,Algeria,DZA,28.000027,2.999983,25
2,Andorra,AND,42.5063,1.5218,1
3,Argentina,ARG,-34.996496,-64.967282,82
4,Armenia,ARM,40.0691,45.0382,6
5,Australia,AUS,-24.776109,134.755,94
6,Austria,AUT,47.5162,14.5501,6
7,Bahrain,BHR,26.155125,50.534461,3
8,Bangladesh,BGD,24.476929,90.293441,17
9,Belarus,BLR,53.7098,27.9534,7


In [68]:
df_story.loc[df_story['country'] == 'Other']

Unnamed: 0,country,country_code,lat,long,num_fake_news
77,Other,,,,312


In [69]:
df.shape

(5613, 36)

#### So the total number of fake news items attributed to a specific country is 5,613 − 312 (the "Other" category) = 5,301

In [70]:
df_story.shape

(112, 5)

In [71]:
# The catch-all "Other" bucket has no location, so leave it out of the story table.
df_story = df_story.loc[df_story['country'].ne('Other')]

In [72]:
df_story.shape

(111, 5)

In [73]:
df_story

Unnamed: 0,country,country_code,lat,long,num_fake_news
0,Afghanistan,AFG,33.9391,67.71,28
1,Algeria,DZA,28.000027,2.999983,25
2,Andorra,AND,42.5063,1.5218,1
3,Argentina,ARG,-34.996496,-64.967282,82
4,Armenia,ARM,40.0691,45.0382,6
5,Australia,AUS,-24.776109,134.755,94
6,Austria,AUT,47.5162,14.5501,6
7,Bahrain,BHR,26.155125,50.534461,3
8,Bangladesh,BGD,24.476929,90.293441,17
9,Belarus,BLR,53.7098,27.9534,7


In [74]:
df_story.num_fake_news.sum()

5301

In [75]:
# Save the per-country story table to CSV; index=False leaves the row index out of the file.
df_story.to_csv("fake_news_story.csv", index=False)