# Prepocessing GDELT Project

## Import libraries

In [1]:
!pip install validators
import validators

import numpy as np
import pandas as pd
pd.options.display.max_columns = 100

import requests
from bs4 import BeautifulSoup
import re
from multiprocessing import Pool
import pandas as pd

import pandas as pd
import numpy as np
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import datetime
import time



## Scrapping

### URL : masterfilelist.txt

In [2]:
def masterfilelist(start_date, end_date):

    response = requests.get("http://data.gdeltproject.org/gdeltv2/masterfilelist.txt")
    content = response.content.decode("utf-8") 
    l = content.split('\n')[-1000:]

    liste = list()
    for i in l: liste.append(i.split(" ")[-1])

    df = pd.DataFrame(liste, columns=['url'])
    df['date_str'] = df['url'].apply(lambda x : x.split("/")[-1].split(".")[0][0:12])
    df = df.iloc[:df.shape[0]-1,:]
    df["date"] = pd.to_datetime(df["date_str"], format='%Y%m%d%H%M')

    start_datem = datetime.datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S")
    end_datem = datetime.datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S")
    df = df.loc[(df['date'] >= start_datem) & (df['date'] <= end_datem)]
    
    df['type_csv'] = df['url'].apply(lambda x : x.lower().split(".csv")[0].split(".")[-1])

    df['id'] = df['date_str']+'_'+df['type_csv']
    
    df = df.drop(columns=['date'])
    
    return df

### URL : masterfilelist-translation.txt

In [3]:
def masterfilelist_translation(start_date, end_date):

    response = requests.get("http://data.gdeltproject.org/gdeltv2/masterfilelist-translation.txt")
    content = response.content.decode("utf-8") 
    l = content.split('\n')[-1000:]

    liste = list()
    for i in l: liste.append(i.split(" ")[-1])

    df = pd.DataFrame(liste, columns=['url_translation'])
    df['date_str_translation'] = df['url_translation'].apply(lambda x : x.split("/")[-1].split(".")[0][0:12])
    df = df.iloc[:df.shape[0]-1,:]
    df["date"] = pd.to_datetime(df["date_str_translation"], format='%Y%m%d%H%M')

    start_datem = datetime.datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S")
    end_datem = datetime.datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S")
    df = df.loc[(df['date'] >= start_datem) & (df['date'] <= end_datem)]
    
    df['type_csv_translation'] = df['url_translation'].apply(lambda x : '_'.join(x.lower().split(".csv")[0].split(".")[-2:]))

    df['type_csv'] = df['url_translation'].apply(lambda x : x.lower().split(".csv")[0].split(".")[-1])

    df['id'] = df['date_str_translation']+'_'+df['type_csv']

    df = df.drop(columns=['type_csv', 'date'])
    
    return df

### Vérification de l'url

In [4]:
def verify_url(u):
    if validators.url(u) == True:
        return True
    else : 
        return False

### Merge masterfile.txt and masterfile_translation.txt

In [5]:
def merge_table(df, df_translation):
    
    # - left join des tableaux
    # - Première séléction : Supprimons les lignes où des NaN apparait
    # - Vérification URL
    
    result = df.merge(df_translation, on='id', how='left').dropna(axis='rows')
    result['work'] = result['url'].apply(lambda x : verify_url(x))
    result['work_translation'] = result['url_translation'].apply(lambda x : verify_url(x))
    return result

### Clean dataset

In [6]:
def clean_dataset(df):
    
    dk = df.groupby('date_str').count()[['id']]
    liste = dk[dk.id < 3].index.tolist()
    for item in liste:
        df = df.loc[result['date_str']!=item]

    liste = df.loc[df['work']==False]['date_str'].unique().tolist()
    for item in liste:
        df = df.loc[df['date_str']!=item]

    liste = df.loc[df['work_translation']==False]['date_str'].unique().tolist()
    for item in liste:
        df = df.loc[result['date_str']!=item]
            
    return df

### Fusion des tables

In [7]:
def concat_table(result):
    
    # Séparation des données de base et de translation ET concaténation 
    
    df_base = result[['url', 'type_csv']]

    df_translation = result[['url_translation', 'type_csv_translation']]

    df_translation = df_translation.rename(columns={'url_translation': "url", 'type_csv_translation': "type_csv"})

    final = pd.concat([df_base, df_translation])
    
    return final

### Lecture des zips

In [8]:
def read_zip(final):
    
    export               = final.loc[final['type_csv'] == 'export', 'url']
    mentions             = final.loc[final['type_csv'] == 'mentions', 'url']
    gkg                  = final.loc[final['type_csv'] == 'gkg', 'url']
    translation_export   = final.loc[final['type_csv'] == 'translation_export', 'url']
    translation_mentions = final.loc[final['type_csv'] == 'translation_mentions', 'url']
    translation_gkg      = final.loc[final['type_csv'] == 'translation_gkg', 'url']
    
    df_export               = list()
    df_mentions             = list()
    df_gkg                  = list()
    df_translation_export   = list()
    df_translation_mentions = list()
    df_translation_gkg      = list()
    
    for i in export.tolist():
        url = urlopen(i) 
        k = i.split("/")[-1].split(".zip")[0]
        zipfile = ZipFile(BytesIO(url.read()))
        FFdata = pd.read_csv(zipfile.open(k), header=None,on_bad_lines='skip', sep="\t", engine='python', encoding = 'latin-1')
        df_export.append(FFdata)
        
    for i in mentions.tolist():
        url = urlopen(i) 
        k = i.split("/")[-1].split(".zip")[0]
        zipfile = ZipFile(BytesIO(url.read()))
        FFdata = pd.read_csv(zipfile.open(k), header=None,on_bad_lines='skip', sep="\t", engine='python', encoding = 'latin-1')
        df_mentions.append(FFdata)
        
    for i in gkg.tolist():
        url = urlopen(i) 
        k = i.split("/")[-1].split(".zip")[0]
        zipfile = ZipFile(BytesIO(url.read()))
        FFdata = pd.read_csv(zipfile.open(k), header=None,on_bad_lines='skip', sep="\t", engine='python', encoding = 'latin-1')
        df_gkg.append(FFdata)
        
    for i in translation_export.tolist():
        url = urlopen(i) 
        k = i.split("/")[-1].split(".zip")[0]
        zipfile = ZipFile(BytesIO(url.read()))
        FFdata = pd.read_csv(zipfile.open(k), header=None,on_bad_lines='skip', sep="\t", engine='python', encoding = 'latin-1')
        df_translation_export.append(FFdata)
        
    for i in translation_mentions.tolist():
        url = urlopen(i) 
        k = i.split("/")[-1].split(".zip")[0]
        zipfile = ZipFile(BytesIO(url.read()))
        FFdata = pd.read_csv(zipfile.open(k), header=None,on_bad_lines='skip', sep="\t", engine='python', encoding = 'latin-1')
        df_translation_mentions.append(FFdata)
        
    for i in translation_gkg.tolist():
        url = urlopen(i) 
        k = i.split("/")[-1].split(".zip")[0]
        zipfile = ZipFile(BytesIO(url.read()))
        FFdata = pd.read_csv(zipfile.open(k), header=None,on_bad_lines='skip', sep="\t", engine='python', encoding = 'latin-1')
        df_translation_gkg.append(FFdata)
        
    export = pd.concat(df_export)
    mentions = pd.concat(df_mentions)
    gkg = pd.concat(df_gkg)

    export_translation = pd.concat(df_translation_export)
    mentions_translation = pd.concat(df_translation_mentions)
    gkg_translation = pd.concat(df_translation_gkg)
        
    return export, mentions, gkg, export_translation, mentions_translation, gkg_translation

### Execution des fonctions

In [9]:
start_time = time.time()

print("\n#### SCRAPPING... #####\n")

df = masterfilelist('2022-01-01 23:00:00', '2022-01-31 23:00:00')

df_translation = masterfilelist_translation('2022-01-01 23:00:00', '2022-01-31 23:00:00')

print("\n-------- %s seconde --------" % (time.time() - start_time))



print("\n\n#### MERGING... #####\n")

result = merge_table(df, df_translation)

print("\n-------- %s seconde --------" % (time.time() - start_time))


print("\n\n#### CLEANSING... #####\n")

result = clean_dataset(result)

print("\n-------- %s seconde --------" % (time.time() - start_time))



# print("\n\n#### SELECTING URL... #####\n")

# result = second_select(result)

# result = select_url_by_datetime(result, '2015-02-18 23:00:00', '2015-02-19 06:00:00')

# print("\n-------- %s seconde --------" % (time.time() - start_time))



print("\n\n#### CONCATENATING... #####\n")

final = concat_table(result)

print("\n-------- %s seconde --------" % (time.time() - start_time))


print("\n\n#### READING & EXPORTING TO CSV... #####\n")

export, mentions, gkg, translation_export, translation_mentions, translation_gkg = read_zip(final)

print("-------- %s seconde --------" % (time.time() - start_time))


#### SCRAPPING... #####


-------- 54.4085168838501 seconde --------


#### MERGING... #####


-------- 54.417428493499756 seconde --------


#### CLEANSING... #####


-------- 54.4204204082489 seconde --------


#### CONCATENATING... #####


-------- 54.422415018081665 seconde --------


#### READING & EXPORTING TO CSV... #####

-------- 387.2316553592682 seconde --------


### On visualise les dataframes

In [10]:
export

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,1025854683,20210131,202101,2021,2021.0849,,,,,,,,,,,PSE,WEST BANK,PSE,,,,,,,,1,40,40,4,1,1.0,4,1,4,1.094891,0,,,,,,,,4,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,4,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,20220131111500,https://www.middleeastmonitor.com/20220131-abb...
1,1025854684,20210131,202101,2021,2021.0849,,,,,,,,,,,PSE,WEST BANK,PSE,,,,,,,,1,40,40,4,1,1.0,2,1,2,1.094891,0,,,,,,,,4,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,1,Israel,IS,IS,,31.5000,34.750000,IS,20220131111500,https://www.middleeastmonitor.com/20220131-abb...
2,1025854685,20210131,202101,2021,2021.0849,,,,,,,,,,,PSE,WEST BANK,PSE,,,,,,,,1,46,46,4,1,7.0,4,1,4,1.094891,0,,,,,,,,4,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,4,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,20220131111500,https://www.middleeastmonitor.com/20220131-abb...
3,1025854686,20210131,202101,2021,2021.0849,,,,,,,,,,,REB,REBEL,,,,,,REB,,,0,190,190,19,4,-10.0,5,1,5,-10.447761,0,,,,,,,,4,"Kashmir, North-West Frontier, Pakistan",PK,PK03,40350,34.7960,72.150200,-2764806,4,"Kashmir, North-West Frontier, Pakistan",PK,PK03,40350,34.7960,72.150200,-2764806,20220131111500,https://www.aljazeera.com/news/2022/1/31/india...
4,1025854687,20210131,202101,2021,2021.0849,ARE,ABU DHABI,ARE,,,,,,,,GOV,GOVERNMENT,,,,,,GOV,,,1,10,10,1,1,0.0,4,1,4,-1.502146,4,"Abu Dhabi, Abu ZÂ¸aby, United Arab Emirates",AE,AE01,28568,24.4667,54.366700,-782066,4,"Abu Dhabi, Abu ZÂ¸aby, United Arab Emirates",AE,AE01,28568,24.4667,54.366700,-782066,4,"Abu Dhabi, Abu ZÂ¸aby, United Arab Emirates",AE,AE01,28568,24.4667,54.366700,-782066,20220131111500,https://economictimes.indiatimes.com/news/indi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,1025953611,20220131,202201,2022,2022.0849,YEMGOV,YEMENI,YEM,,,,,GOV,,,JOR,JORDANIAN,JOR,,,,,,,,1,51,51,5,1,3.4,12,1,12,3.781513,1,Yemen,YM,YM,,15.5000,47.500000,YM,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,1,Yemen,YM,YM,,15.5000,47.500000,YM,20220131230000,http://www.jordantimes.com/news/local/jordan-y...
998,1025953612,20220131,202201,2022,2022.0849,ran,RANA,,,ran,,,,,,JOR,JORDAN,JOR,,,,,,,,0,10,10,1,1,0.0,3,1,3,6.318083,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,20220131230000,http://jordantimes.com/news/local/kings-vision...
999,1025953613,20220131,202201,2022,2022.0849,znd,AZERI,,,znd,,,,,,BUS,COMPANIES,,,,,,BUS,,,1,193,193,19,4,-10.0,6,1,6,-4.592423,4,"Azeri, Ida-Virumaa, Estonia",EN,EN03,15925,59.4506,26.867500,-2621534,4,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,4,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,20220131230000,https://www.msn.com/en-xl/news/other/court-app...
1000,1025953614,20220131,202201,2022,2022.0849,znd,AZERI,,,znd,,,,,,EST,ESTONIA,EST,,,,,,,,1,192,192,19,4,-9.5,4,1,4,-4.592423,4,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,20220131230000,https://www.msn.com/en-xl/news/other/court-app...


In [11]:
mentions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1025854683,20220131111500,20220131111500,1,middleeastmonitor.com,https://www.middleeastmonitor.com/20220131-abb...,4,-1,488,429,0,40,1664,1.094891,,
1,1025854684,20220131111500,20220131111500,1,middleeastmonitor.com,https://www.middleeastmonitor.com/20220131-abb...,4,-1,503,429,0,20,1664,1.094891,,
2,1025854685,20220131111500,20220131111500,1,middleeastmonitor.com,https://www.middleeastmonitor.com/20220131-abb...,4,-1,503,471,0,40,1664,1.094891,,
3,1025854686,20220131111500,20220131111500,1,aljazeera.com,https://www.aljazeera.com/news/2022/1/31/india...,23,-1,4219,4231,1,50,4195,-10.447761,,
4,1025825622,20220131063000,20220131111500,1,bdnews24.com,https://bdnews24.com/neighbours/2022/01/31/ind...,14,-1,2128,2143,1,10,2103,-12.500000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3802,1025939176,20220131210000,20220131230000,1,whbl.com,https://whbl.com/2022/01/31/south-africa-scrap...,1,75,106,123,1,60,560,-6.382979,,
3803,1025953612,20220131230000,20220131230000,1,jordantimes.com,http://jordantimes.com/news/local/kings-vision...,10,1831,1877,1868,1,30,2907,6.318083,,
3804,1025953613,20220131230000,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1232,1226,1,60,5413,-4.592423,,
3805,1025953614,20220131230000,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1206,1150,0,40,5413,-4.592423,,


In [12]:
gkg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,20220131111500-0,20220131111500,1,pressandjournal.co.uk,https://www.pressandjournal.co.uk/fp/business/...,,,TAX_ETHNICITY;TAX_ETHNICITY_SCOTTISH;EPU_ECONO...,"TAX_DISEASE_INFECTION,2311;TAX_FNCACT_DIRECTOR...",1#United Kingdom#UK#UK#54#-4#UK,1#Scotland#UK#UK##54#-4#UK#105;1#Scotland#UK#U...,paul gordon;fraser sime,"Paul Gordon,1859;Paul Gordon,2139;Fraser Sime,...",scotland at bank;lloyds banking group;lloyds b...,"Scotland At Bank,931;Lloyds Banking Group,1397...","2.27848101265823,4.30379746835443,2.0253164556...",,"wc:366,c1.2:6,c12.1:20,c12.10:36,c12.12:13,c12...",https://www.pressandjournal.co.uk/wp-content/u...,,,,,"United Kingdom,623;Business Barometer,791;Unit...","12,months rose,356;200,businesses monthly,659;...",,<PAGE_LINKS>https://business.bankofscotland.co...
1,20220131111500-1,20220131111500,1,hancinema.net,https://www.hancinema.net/photo-new-still-adde...,,,TAX_ETHNICITY;TAX_ETHNICITY_KOREAN;TAX_WORLDLA...,"TAX_ETHNICITY_KOREAN,33;TAX_ETHNICITY_KOREAN,4...",,,lee seung-young,,korean national tax office,"Korean National Tax Office,451","-4.08163265306122,1.02040816326531,5.102040816...",,"wc:76,c12.1:5,c12.10:4,c12.12:1,c12.13:1,c12.1...",https://photos.hancinema.net/photos/photo14223...,,,,,"Son Hyun-joo,189;Park Yong-woo,208;Park Ji-il,...","16,episodes,204;",,<PAGE_LINKS>https://www.hancinema.net/korean_C...
2,20220131111500-2,20220131111500,1,dailystar.co.uk,https://www.dailystar.co.uk/showbiz/inside-dan...,,,RELIGION;,"RELIGION,1932;RELIGION,2176;","4#Manchester, Manchester, United Kingdom#UK#UK...","4#Manchester, Manchester, United Kingdom#UK#UK...",gary speed;dan walker;sarah walker,"Gary Speed,2664;Dan Walker,10;Dan Walker,1163;...",instagram,"Instagram,3535","0.720461095100864,2.88184438040346,2.161383285...",1#0#0#1999#176;1#0#0#2006#1290;1#0#0#2009#1399...,"wc:612,c1.2:2,c1.4:3,c12.1:37,c12.10:51,c12.12...",https://i2-prod.dailystar.co.uk/incoming/artic...,,,,2701|29||struggled to find the answers#3063|23...,"Dan Walker,11;Strictly Come,375;Nadiya Bychkov...","103,radio station,167;6,Nations,1142;5,Live un...",,<PAGE_LINKS>https://www.birminghammail.co.uk/n...
3,20220131111500-3,20220131111500,1,hltv.org,https://www.hltv.org/forums/threads/2578384/ma...,,,EDUCATION;SOC_POINTSOFINTEREST;SOC_POINTSOFINT...,"TAX_WORLDMAMMALS_ASS,39;EDUCATION,27;SOC_POINT...",,,,,,,"-3.84615384615385,0,3.84615384615385,3.8461538...",,"wc:24,c12.1:2,c12.10:1,c12.13:1,c12.3:1,c12.5:...",,,,https://youtube.com/user/wwwHLTVorg;,,,,,<PAGE_TITLE>Forum thread: Major reveal when?</...
4,20220131111500-4,20220131111500,1,armenpress.am,https://armenpress.am/eng/news/1074302/,,,TAX_FNCACT;TAX_FNCACT_ENGINEER;MANMADE_DISASTE...,"IDEOLOGY,1766;WB_1467_EDUCATION_FOR_ALL,2206;W...","4#Azeri, Vayots' Dzor, Armenia#AM#AM10#39.7195...","4#Azeri, Vayots' Dzor, Armenia#AM#AM10#4935#39...",gayane gaboyan;albert hovakimyan,"Gayane Gaboyan,2693;Albert Hovakimyan,419",young;ministry of education,"Young,28;Young,149;Young,296;Young,2541;Minist...","1.67064439140811,4.29594272076372,2.6252983293...",4#1#31#0#253,"wc:382,c12.1:36,c12.10:58,c12.12:22,c12.13:12,...",https://armenpress.am/static/news/b/2022/01/10...,,,https://youtube.com/channel/UCf2FTVdftbWx9Swxh...,,"Dragon Reaper,527;Dragon Reaper,681;Israeli-ma...","100,Ideas for Armenia,46;100,Ideas for Armenia...",,<PAGE_ALTURL_AMP>https://armenpress.am/eng/amp...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630,20220131230000-1630,20220131230000,1,palmbeachpost.com,https://www.palmbeachpost.com/story/news/2022/...,,,UNGP_FORESTS_RIVERS_OCEANS;EPU_CATS_MIGRATION_...,"TAX_ETHNICITY_INDIAN,5803;NATURAL_DISASTER_FRE...","3#Miami, Florida, United States#US#USFL#25.774...","3#Alachua, Florida, United States#US#USFL#FL00...",louise pearson;kathy hillard dimpflmaier;lisa ...,"Louise Pearson,2671;Kathy Hillard Dimpflmaier,...",instagram;twitter;wildlife conservation commis...,"Instagram,68;Instagram,2449;Instagram,6150;Ins...","-1.04712041884817,1.42109199700823,2.468212415...",,"wc:1203,c1.1:2,c1.3:1,c12.1:94,c12.10:139,c12....",https://www.gannett-cdn.com/presto/2022/01/31/...,,https://pic.twitter.com/uE4zDOGHuC;https://ins...,,3596|132||This is a frozen iguana. In Florida ...,"Palm Beach,247;Cold Florida,275;Palm Beach,519...","32,degrees,420;20,inches including tail,2340;3...",,<PAGE_LINKS>https://twitter.com/Drew_Morris/st...
1631,20220131230000-1631,20220131230000,1,cosmopolitan.com,https://www.cosmopolitan.com/style-beauty/beau...,,,TAX_DISEASE;TAX_DISEASE_CANCER;WB_1406_DISEASE...,"TAX_DISEASE_CANCER,165;WB_1406_DISEASES,165;WB...",,,,,,,"-1.43884892086331,2.87769784172662,4.316546762...",,"wc:116,c12.1:15,c12.10:20,c12.12:9,c12.13:5,c1...",https://hips.hearstapps.com/hmg-prod.s3.amazon...,,,https://youtube.com/c/cosmopolitan?sub_confirm...,,,"21,best face,519;",,<PAGE_LINKS>https://www.cosmopolitan.com/style...
1632,20220131230000-1632,20220131230000,1,yahoo.com,https://news.yahoo.com/utahs-ingles-mri-shows-...,,,EPU_CATS_REGULATION;WB_1921_PRIVATE_SECTOR_DEV...,"MANMADE_DISASTER_IMPLIED,2591;MANMADE_DISASTER...",,,tobias koppers;yehuda katz;stefan penner;tom d...,"Tobias Koppers,29561;Yehuda Katz,904;Stefan Pe...",yahoo,"Yahoo,6724;Yahoo,25516;Yahoo,25539;Yahoo,25709...","-1.14793155728828,0.389863547758285,1.53779510...",1#0#0#2014#907,"wc:4479,c1.1:3,c1.2:16,c12.1:112,c12.10:167,c1...",https://s.yimg.com/ny/api/res/1.2/aPaeELnptuV_...,,,,1711|202||===Object prototype toString call ( ...,"Tom Dale,1078;Stefan Penner,1097;Jake Archibal...","90,fireImageBeacon,8449;",,<PAGE_ALTURL_AMP>https://news.yahoo.com/amphtm...
1633,20220131230000-1633,20220131230000,1,iheart.com,https://my100fm.iheart.com/content/2022-01-31-...,,,NATURAL_DISASTER;NATURAL_DISASTER_HEAVY_SNOW;N...,WB_1458_HEALTH_PROMOTION_AND_DISEASE_PREVENTIO...,,,,,mansfield corporation,"Mansfield Corporation,87","-2.96296296296296,2.96296296296296,5.925925925...",,"wc:114,c12.1:7,c12.10:10,c12.12:2,c12.13:3,c12...",https://i.iheart.com/v3/re/assets.getty/60622e...,,https://instagram.com/p/CZPOVGLrQKj;https://pi...,https://youtube.com/watch?v=ZU6kXopIfqQ;https:...,,"Mansfield Residents,240;Mansfield Codified Ord...",,,<PAGE_PRECISEPUBTIMESTAMP>20220131182800</PAGE...


In [13]:
translation_export

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,1025857535,20210131,202101,2021,2021.0849,,,,,,,,,,,PRT,PORTUGAL,PRT,,,,,,,,0,57,57,5,1,8.0,10,1,10,-0.824742,0,,,,,,,,1,Portugal,PO,PO,,39.5000,-8.0000,PO,1,Portugal,PO,PO,,39.5000,-8.0000,PO,20220131111500,https://omirante.pt/sociedade/camara-do-entron...
1,1025857536,20210131,202101,2021,2021.0849,GOV,PRESIDENT,,,,,,GOV,,,,,,,,,,,,,0,40,40,4,1,1.0,6,1,6,-4.126547,1,Ireland,EI,EI,,53.0000,-8.0000,EI,0,,,,,,,,1,Ireland,EI,EI,,53.0000,-8.0000,EI,20220131111500,https://www.jornada.com.mx/2022/01/31/mundo/02...
2,1025857537,20210131,202101,2021,2021.0849,PRT,PORTUGAL,PRT,,,,,,,,,,,,,,,,,,0,57,57,5,1,8.0,10,1,10,-0.824742,1,Portugal,PO,PO,,39.5000,-8.0000,PO,0,,,,,,,,1,Portugal,PO,PO,,39.5000,-8.0000,PO,20220131111500,https://omirante.pt/sociedade/camara-do-entron...
3,1025857538,20220124,202201,2022,2022.0658,AUS,ADELAIDE,AUS,,,,,,,,TON,TONGA,TON,,,,,,,,1,80,80,8,2,5.0,10,1,10,-0.621118,4,"Hunga, VavaÂu, Tonga",TN,TN03,27407,-18.6833,-174.1330,-2778109,4,"Hunga, VavaÂu, Tonga",TN,TN03,27407,-18.6833,-174.1330,-2778109,4,"Hunga, VavaÂu, Tonga",TN,TN03,27407,-18.6833,-174.1330,-2778109,20220131111500,https://mundo.sputniknews.com/20220131/un-buqu...
4,1025857539,20220124,202201,2022,2022.0658,GOV,PRESIDENT,,,,,,GOV,,,EUR,EUROPE,EUR,,,,,,,,0,30,30,3,1,4.0,10,1,10,-1.061008,1,Ukraine,UP,UP,,49.0000,32.0000,UP,1,Ukraine,UP,UP,,49.0000,32.0000,UP,1,Ukraine,UP,UP,,49.0000,32.0000,UP,20220131111500,http://afn.by/news/i/294054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,1025954044,20220131,202201,2022,2022.0849,USAMED,ASSOCIATED PRESS,USA,,,,,MED,,,UKR,UKRAINIAN,UKR,,,,,,,,0,190,190,19,4,-10.0,8,1,8,-1.384615,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.4333,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.4333,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.4333,30.5167,-1044367,20220131230000,https://www.vesti.ru/article/2670780
429,1025954045,20220131,202201,2022,2022.0849,USAPRIGOV,PUERTO RICO,USA,,,,,GOV,,,,,,,,,,,,,1,51,51,5,1,3.4,10,1,10,-1.067616,1,Puerto Rico,RQ,RQ,,18.2359,-66.4838,RQ,0,,,,,,,,1,Puerto Rico,RQ,RQ,,18.2359,-66.4838,RQ,20220131230000,https://www.telemundopr.com/noticias/puerto-ri...
430,1025954046,20220131,202201,2022,2022.0849,VEN,VENEZUELA,VEN,,,,,,,,MIL,MILITARY,,,,,,MIL,,,1,36,36,3,1,4.0,4,1,4,2.242152,1,Venezuela,VE,VE,,8.0000,-66.0000,VE,1,Venezuela,VE,VE,,8.0000,-66.0000,VE,1,Venezuela,VE,VE,,8.0000,-66.0000,VE,20220131230000,http://www.laverdad.com/zulia/190687-abierta-c...
431,1025954047,20220131,202201,2022,2022.0849,VEN,VENEZUELAN,VEN,,,,,,,,OPP,POLITICAL PRISONER,,,,,,OPP,,,1,111,111,11,3,-2.0,10,1,10,-7.242340,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.0233,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.0233,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.0233,-64.5973,-938440,20220131230000,https://www.elnacional.com/venezuela/denuncian...


In [14]:
translation_mentions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1025857535,20220131111500,20220131111500,1,omirante.pt,https://omirante.pt/sociedade/camara-do-entron...,9,-1,2730,2678,1,100,3210,-0.824742,srclc:por;eng:GT-POR 1.0,
1,967178183,20210131081500,20220131111500,1,rtn.ch,https://www.rtn.ch/rtn/Actualite/economie/Le-m...,1,58,-1,94,1,100,2854,-0.214133,srclc:fra;eng:GT-FRA 1.0,
2,967212781,20210131150000,20220131111500,1,jornada.com.mx,https://www.jornada.com.mx/2022/01/31/mundo/02...,9,1814,-1,1843,1,60,4426,-4.126547,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
3,967143962,20210131000000,20220131111500,1,jornada.com.mx,https://www.jornada.com.mx/2022/01/31/mundo/02...,9,1814,-1,1843,0,40,4426,-4.126547,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
4,1025857536,20220131111500,20220131111500,1,jornada.com.mx,https://www.jornada.com.mx/2022/01/31/mundo/02...,9,1814,-1,1781,1,60,4426,-4.126547,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,1025804790,20220131011500,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2141,2098,0,20,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1117,1025804791,20220131011500,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2122,2078,1,60,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1118,1025954046,20220131230000,20220131230000,1,laverdad.com,http://www.laverdad.com/zulia/190687-abierta-c...,1,32,55,45,0,40,1368,2.242152,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1119,1025954047,20220131230000,20220131230000,1,elnacional.com,https://www.elnacional.com/venezuela/denuncian...,2,107,154,118,1,100,2200,-7.242340,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,


In [15]:
translation_gkg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,20220131111500-T0,2.022013e+13,1.0,kudyznudy.cz,https://www.kudyznudy.cz/akce/valentynsky-work...,,,TAX_FNCACT;TAX_FNCACT_WOMEN;TAX_ETHNICITY;TAX_...,"TAX_ECON_PRICE,545;TAX_ECON_PRICE,625;TAX_ETHN...","4#Prague, Praha, HlavnÃ­Esto, Czech Republic#E...","4#Prague, Praha, HlavnÃ­Esto, Czech Republic#E...",,,,,"-0.689655172413793,0.689655172413793,1.3793103...",,"wc:129,c12.1:5,c12.10:5,c12.12:1,c12.14:4,c12....",https://www.kudyznudy.cz/files/29/29deb825-bbb...,,,https://youtube.com/user/kudyznudycz?feature=r...,,,"141,of the,248;221,of The black Bridge,276;280...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_TITLE>Kudy z nudy - Valent&#xFD;nsk&#xFD...
1,20220131111500-T1,2.022013e+13,1.0,kudyznudy.cz,https://www.kudyznudy.cz/akce/moderni-umeni-po...,,,TAX_ETHNICITY;TAX_ETHNICITY_CZECH;TAX_WORLDLAN...,"TAX_ETHNICITY_CZECH,148;TAX_WORLDLANGUAGES_CZE...","4#Bohnice, StredoceskÃ½, Czech Republic#EZ#EZ8...","4#Bohnice, StredoceskÃ½, Czech Republic#EZ#EZ8...",,,,,"4.02010050251256,6.03015075376884,2.0100502512...",,"wc:185,c1.1:1,c1.3:2,c12.1:14,c12.10:11,c12.12...",https://www.kudyznudy.cz/files/e1/e14b5bef-9f4...,,,https://youtube.com/user/kudyznudycz?feature=r...,,"Her Barbara From,216;Beginnings Were,740",,srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_TITLE>Kudy z nudy - Modern&#xED; um&#x11...
2,20220131111500-T2,2.022013e+13,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/studie-konsp...,,,TAX_FNCACT;TAX_FNCACT_SUPPORTERS;EXTREMISM;TAX...,"TAX_ETHNICITY_GERMAN,1366;TAX_WORLDLANGUAGES_G...",1#Germany#GM#GM#51.5#10.5#GM;1#Hungary#HU#HU#4...,1#Polish#PL#PL##52#20#PL#1426;1#Israeli#IS#IS#...,johannes gutenberga,"Johannes Gutenberga,581",research has,"Research Has,603;Research Has,2177","-0.915750915750916,2.38095238095238,3.29670329...",,"wc:523,c12.1:42,c12.10:65,c12.12:16,c12.13:26,...",https://i3.cn.cz/14/1535442500_P20180828025480...,,,https://youtube.com/user/CTKvideozpravy/feed;,,"Czech Republic,253;Universities Johannes Guten...","26,countries including the Czech,173;2,stages,...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://www.nature.com/articles/s4...
3,20220131111500-T3,2.022013e+13,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/londyn-avizu...,,,SANCTIONS;TAX_WORLDLANGUAGES;TAX_WORLDLANGUAGE...,"EPU_POLICY_GOVERNMENT,1365;EPU_POLICY_GOVERNME...","4#City Of London, London, City Of, United King...",1#Russia#RS#RS##60#100#RS#168;1#Russia#RS#RS##...,susanne kass;simon clarke;vladimir putin;boris...,"Susanne Kass,3248;Simon Clarke,424;Vladimir Pu...",development has;agency reuters,"Development Has,2781;Agency Reuters,1506","-4.15094339622642,1.50943396226415,5.660377358...",,"wc:525,c12.1:30,c12.10:48,c12.11:1,c12.12:19,c...",https://i3.cn.cz/14/1490884966_P20170330083310...,,,https://youtube.com/user/CTKvideozpravy/feed;,,"Moscow Kingdom,137;New Russian,168;British Tre...","11,of the 2022,2580;",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://www.thetimes.co.uk/article...
4,20220131111500-T4,2.022013e+13,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/v-karlovarsk...,,,MANMADE_DISASTER_IMPLIED;,"MANMADE_DISASTER_IMPLIED,924;MANMADE_DISASTER_...","1#Czech Republic#EZ#EZ#49.75#15#EZ;4#Bochov, K...","4#Prague, Praha, HlavnÃ­Esto, Czech Republic#E...",cheb transmotel,"Cheb Transmotel,1091",,,"-2.83768444948922,0.340522133938706,3.17820658...",,"wc:820,c1.3:2,c12.1:43,c12.10:68,c12.12:20,c12...",https://i3.cn.cz/14/1643619479_P2022013102743.jpg,https://i3.cn.cz/6/1642761615_P2022012104310.j...,,https://youtube.com/user/CTKvideozpravy/feed;,,"Prague Heavy,99;Problems May,189;Problems May,...","22,photo,264;22,photo,296;22,photo,328;22,the ...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://dopravniinfo.cz</PAGE_LINK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,20220131230000-T1408,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/smartphone-man-hinh-ga...,"AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...","AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...",TAX_ECON_PRICE;TAX_FNCACT;TAX_FNCACT_CHILDREN;...,"IDEOLOGY,5431;MEDIA_MSM,3219;TAX_ECON_PRICE,78...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#Vietnam#VM#VM##16.166667#107.833333#VM#1169,,,,,"1.17860380779692,4.3517679057117,3.17316409791...",,"wc:1070,nwc:1280,c1.1:4,c1.3:2,c12.1:111,c12.1...",https://i.vietgiaitri.com/2022/2/1/smartphone-...,https://i.vietgiaitri.com/2022/1/28/cu-1-ngay-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Pictured Phone,459;Fresh New,2622;Apple New,28...","9,quality brought a cover,2124;1000000000,USD ...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/galaxy-z-f...
1409,20220131230000-T1409,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/the-gioi-da-ghi-nhan-t...,,,CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WELLBEING...,"GENERAL_GOVERNMENT,2283;EPU_POLICY_GOVERNMENT,...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#India#IN#IN##20#77#IN#1674;1#India#IN#IN##20...,europe europe;europe asia,"Europe Europe,822;Europe Europe,978;Europe Asi...",health india,"Health India,1892","-2.68817204301075,2.01612903225806,4.704301075...",,"wc:734,nwc:933,c1.3:1,c12.1:46,c12.10:106,c12....",https://t.vietgiaitri.com/2022/2/1/the-gioi-da...,https://i.vietgiaitri.com/2022/1/17/gioi-chuc-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"World Under,49;Pictured Staff,311;Hospital Pic...","1000000,World Under p statistics,32;1000000,th...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/benh-nhan-...
1410,20220131230000-T1410,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/thuong-lai-dam-nuoc-ma...,WOUND#2000##0######;CRISISLEX_CRISISLEXREC#200...,WOUND#2000##0#######0;CRISISLEX_CRISISLEXREC#2...,WOUND;CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WEL...,"TAX_ECON_PRICE,511;TAX_ECON_PRICE,1077;TAX_ECO...",,,,,english campaign,"English Campaign,1859","1.59340659340659,6.37362637362637,4.7802197802...",,"wc:1799,nwc:1834,c1.1:4,c1.3:1,c12.1:192,c12.1...",https://i.vietgiaitri.com/2022/2/1/thuong-lai-...,https://i.vietgiaitri.com/2022/1/31/giao-thua-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Injured Drive,14;Flower New Year,67;Life Noon,...","2,English Campaign s traders,1558;1000000,copp...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/ban-hoa-ke...
1411,20220131230000-T1411,2.022013e+13,1.0,vietgiaitri.com,https://vietgiaitri.com/thu-tuong-nhanh-chong-...,,,TAX_FNCACT;TAX_FNCACT_MINISTER;LEADER;TAX_FNCA...,"TAX_FNCACT_GUIDE,1083;TAX_FNCACT_CHILD,1904;TA...","4#Hanoi, Ha N?I, Vietnam, Republic Of#VM#VM44#...",1#Reunion#RE#RE##-21.1#55.6#RE#2973;1#Reunion#...,,,,,"3.31534309946029,6.32228218966847,3.0069390902...",,"wc:1275,nwc:1255,c1.1:1,c1.2:3,c12.1:104,c12.1...",https://t.vietgiaitri.com/2022/2/1/thu-tuong-n...,https://i.vietgiaitri.com/2022/1/22/chu-tich-n...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Prime Minister Fast,20;New Year Life Here,88;P...","1000000,people learning work,3915;1000000000,U...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/bo-giao-th...


### Pre-processing base table

In [16]:
def rename_columns(export, mentions, gkg, translation_export, translation_mentions, translation_gkg):
    
    for i in range(export.shape[1]):
        export.rename({i: 'export_'+str(i)}, axis=1, inplace=True)
        
    for i in range(mentions.shape[1]):
        mentions.rename({i: 'mentions_'+str(i)}, axis=1, inplace=True)    
        
    for i in range(gkg.shape[1]):
        gkg.rename({i: 'gkg_'+str(i)}, axis=1, inplace=True)
        
    for i in range(translation_export.shape[1]):
        translation_export.rename({i: 'export_translation_'+str(i)}, axis=1, inplace=True)
        
    for i in range(translation_mentions.shape[1]):
        translation_mentions.rename({i: 'mentions_translation_'+str(i)}, axis=1, inplace=True)    
        
    for i in range(translation_gkg.shape[1]):
        translation_gkg.rename({i: 'gkg_translation_'+str(i)}, axis=1, inplace=True)            
    
    return export, mentions, gkg, translation_export, translation_mentions, translation_gkg

export, mentions, gkg, export_translation, mentions_translation, gkg_translation = rename_columns(export, mentions, gkg, translation_export, translation_mentions, translation_gkg)

In [17]:
export_translation

Unnamed: 0,export_translation_0,export_translation_1,export_translation_2,export_translation_3,export_translation_4,export_translation_5,export_translation_6,export_translation_7,export_translation_8,export_translation_9,export_translation_10,export_translation_11,export_translation_12,export_translation_13,export_translation_14,export_translation_15,export_translation_16,export_translation_17,export_translation_18,export_translation_19,export_translation_20,export_translation_21,export_translation_22,export_translation_23,export_translation_24,export_translation_25,export_translation_26,export_translation_27,export_translation_28,export_translation_29,export_translation_30,export_translation_31,export_translation_32,export_translation_33,export_translation_34,export_translation_35,export_translation_36,export_translation_37,export_translation_38,export_translation_39,export_translation_40,export_translation_41,export_translation_42,export_translation_43,export_translation_44,export_translation_45,export_translation_46,export_translation_47,export_translation_48,export_translation_49,export_translation_50,export_translation_51,export_translation_52,export_translation_53,export_translation_54,export_translation_55,export_translation_56,export_translation_57,export_translation_58,export_translation_59,export_translation_60
0,1025857535,20210131,202101,2021,2021.0849,,,,,,,,,,,PRT,PORTUGAL,PRT,,,,,,,,0,57,57,5,1,8.0,10,1,10,-0.824742,0,,,,,,,,1,Portugal,PO,PO,,39.5000,-8.0000,PO,1,Portugal,PO,PO,,39.5000,-8.0000,PO,20220131111500,https://omirante.pt/sociedade/camara-do-entron...
1,1025857536,20210131,202101,2021,2021.0849,GOV,PRESIDENT,,,,,,GOV,,,,,,,,,,,,,0,40,40,4,1,1.0,6,1,6,-4.126547,1,Ireland,EI,EI,,53.0000,-8.0000,EI,0,,,,,,,,1,Ireland,EI,EI,,53.0000,-8.0000,EI,20220131111500,https://www.jornada.com.mx/2022/01/31/mundo/02...
2,1025857537,20210131,202101,2021,2021.0849,PRT,PORTUGAL,PRT,,,,,,,,,,,,,,,,,,0,57,57,5,1,8.0,10,1,10,-0.824742,1,Portugal,PO,PO,,39.5000,-8.0000,PO,0,,,,,,,,1,Portugal,PO,PO,,39.5000,-8.0000,PO,20220131111500,https://omirante.pt/sociedade/camara-do-entron...
3,1025857538,20220124,202201,2022,2022.0658,AUS,ADELAIDE,AUS,,,,,,,,TON,TONGA,TON,,,,,,,,1,80,80,8,2,5.0,10,1,10,-0.621118,4,"Hunga, VavaÂu, Tonga",TN,TN03,27407,-18.6833,-174.1330,-2778109,4,"Hunga, VavaÂu, Tonga",TN,TN03,27407,-18.6833,-174.1330,-2778109,4,"Hunga, VavaÂu, Tonga",TN,TN03,27407,-18.6833,-174.1330,-2778109,20220131111500,https://mundo.sputniknews.com/20220131/un-buqu...
4,1025857539,20220124,202201,2022,2022.0658,GOV,PRESIDENT,,,,,,GOV,,,EUR,EUROPE,EUR,,,,,,,,0,30,30,3,1,4.0,10,1,10,-1.061008,1,Ukraine,UP,UP,,49.0000,32.0000,UP,1,Ukraine,UP,UP,,49.0000,32.0000,UP,1,Ukraine,UP,UP,,49.0000,32.0000,UP,20220131111500,http://afn.by/news/i/294054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,1025954044,20220131,202201,2022,2022.0849,USAMED,ASSOCIATED PRESS,USA,,,,,MED,,,UKR,UKRAINIAN,UKR,,,,,,,,0,190,190,19,4,-10.0,8,1,8,-1.384615,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.4333,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.4333,30.5167,-1044367,4,"Kiev, Ukraine (general), Ukraine",UP,UP00,28554,50.4333,30.5167,-1044367,20220131230000,https://www.vesti.ru/article/2670780
429,1025954045,20220131,202201,2022,2022.0849,USAPRIGOV,PUERTO RICO,USA,,,,,GOV,,,,,,,,,,,,,1,51,51,5,1,3.4,10,1,10,-1.067616,1,Puerto Rico,RQ,RQ,,18.2359,-66.4838,RQ,0,,,,,,,,1,Puerto Rico,RQ,RQ,,18.2359,-66.4838,RQ,20220131230000,https://www.telemundopr.com/noticias/puerto-ri...
430,1025954046,20220131,202201,2022,2022.0849,VEN,VENEZUELA,VEN,,,,,,,,MIL,MILITARY,,,,,,MIL,,,1,36,36,3,1,4.0,4,1,4,2.242152,1,Venezuela,VE,VE,,8.0000,-66.0000,VE,1,Venezuela,VE,VE,,8.0000,-66.0000,VE,1,Venezuela,VE,VE,,8.0000,-66.0000,VE,20220131230000,http://www.laverdad.com/zulia/190687-abierta-c...
431,1025954047,20220131,202201,2022,2022.0849,VEN,VENEZUELAN,VEN,,,,,,,,OPP,POLITICAL PRISONER,,,,,,OPP,,,1,111,111,11,3,-2.0,10,1,10,-7.242340,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.0233,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.0233,-64.5973,-938440,4,"Carabobo, AnzoÃ¡gui, Venezuela",VE,VE02,31872,10.0233,-64.5973,-938440,20220131230000,https://www.elnacional.com/venezuela/denuncian...


### Transform Date fiels into datetime 

In [18]:
def date_to_datime(export, mentions, gkg, export_translation, mentions_translation, gkg_translation):
    export["export_1"] = pd.to_datetime(export["export_1"], format='%Y%m%d')
    mentions["mentions_1"] = pd.to_datetime(mentions["mentions_1"], format='%Y%m%d%H%M%S')
    gkg["gkg_1"] = pd.to_datetime(gkg["gkg_1"], format='%Y%m%d%H%M%S')

    export_translation["export_translation_1"] = pd.to_datetime(export_translation["export_translation_1"], format='%Y%m%d')
    mentions_translation["mentions_translation_1"] = pd.to_datetime(mentions_translation["mentions_translation_1"], format='%Y%m%d%H%M%S')
    gkg_translation["gkg_translation_1"] = pd.to_datetime(gkg_translation["gkg_translation_1"], format='%Y%m%d%H%M%S')
    
    return export, mentions, gkg, export_translation, mentions_translation, gkg_translation

export, mentions, gkg, export_translation, mentions_translation, gkg_translation = date_to_datime(export, mentions, gkg, export_translation, mentions_translation, gkg_translation)

In [19]:
mentions_translation

Unnamed: 0,mentions_translation_0,mentions_translation_1,mentions_translation_2,mentions_translation_3,mentions_translation_4,mentions_translation_5,mentions_translation_6,mentions_translation_7,mentions_translation_8,mentions_translation_9,mentions_translation_10,mentions_translation_11,mentions_translation_12,mentions_translation_13,mentions_translation_14,mentions_translation_15
0,1025857535,2022-01-31 11:15:00,20220131111500,1,omirante.pt,https://omirante.pt/sociedade/camara-do-entron...,9,-1,2730,2678,1,100,3210,-0.824742,srclc:por;eng:GT-POR 1.0,
1,967178183,2021-01-31 08:15:00,20220131111500,1,rtn.ch,https://www.rtn.ch/rtn/Actualite/economie/Le-m...,1,58,-1,94,1,100,2854,-0.214133,srclc:fra;eng:GT-FRA 1.0,
2,967212781,2021-01-31 15:00:00,20220131111500,1,jornada.com.mx,https://www.jornada.com.mx/2022/01/31/mundo/02...,9,1814,-1,1843,1,60,4426,-4.126547,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
3,967143962,2021-01-31 00:00:00,20220131111500,1,jornada.com.mx,https://www.jornada.com.mx/2022/01/31/mundo/02...,9,1814,-1,1843,0,40,4426,-4.126547,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
4,1025857536,2022-01-31 11:15:00,20220131111500,1,jornada.com.mx,https://www.jornada.com.mx/2022/01/31/mundo/02...,9,1814,-1,1781,1,60,4426,-4.126547,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,1025804790,2022-01-31 01:15:00,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2141,2098,0,20,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1117,1025804791,2022-01-31 01:15:00,20220131230000,1,confirmado.com.ve,http://confirmado.com.ve/venezuela-recibe-un-m...,6,2067,2122,2078,1,60,2348,2.393617,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1118,1025954046,2022-01-31 23:00:00,20220131230000,1,laverdad.com,http://www.laverdad.com/zulia/190687-abierta-c...,1,32,55,45,0,40,1368,2.242152,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
1119,1025954047,2022-01-31 23:00:00,20220131230000,1,elnacional.com,https://www.elnacional.com/venezuela/denuncian...,2,107,154,118,1,100,2200,-7.242340,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,


### Merge Table

mentions_translation + mentions + export

In [20]:
def merge_table(export, mentions, mentions_translation):
    sub_mentions_translation = mentions_translation.loc[:,["mentions_translation_0", "mentions_translation_14"]]
    sub_mentions_translation["mentions_translation_14"] = sub_mentions_translation["mentions_translation_14"].apply(lambda x: x.split(";")[0].split(":")[1])
    
    mentions_mentions_translation = mentions.merge(sub_mentions_translation, left_on='mentions_0', right_on='mentions_translation_0', how='left')
    
    export_mentions_mentions_translation_joined = mentions_mentions_translation.merge(export, left_on="mentions_0", right_on="export_0", how='left')

    return export_mentions_mentions_translation_joined

export_mentions_mentions_translation_joined = merge_table(export, mentions, mentions_translation) 

export_mentions_mentions_translation_joined

Unnamed: 0,mentions_0,mentions_1,mentions_2,mentions_3,mentions_4,mentions_5,mentions_6,mentions_7,mentions_8,mentions_9,mentions_10,mentions_11,mentions_12,mentions_13,mentions_14,mentions_15,mentions_translation_0,mentions_translation_14,export_0,export_1,export_2,export_3,export_4,export_5,export_6,export_7,export_8,export_9,export_10,export_11,export_12,export_13,export_14,export_15,export_16,export_17,export_18,export_19,export_20,export_21,export_22,export_23,export_24,export_25,export_26,export_27,export_28,export_29,export_30,export_31,export_32,export_33,export_34,export_35,export_36,export_37,export_38,export_39,export_40,export_41,export_42,export_43,export_44,export_45,export_46,export_47,export_48,export_49,export_50,export_51,export_52,export_53,export_54,export_55,export_56,export_57,export_58,export_59,export_60
0,1025854683,2022-01-31 11:15:00,20220131111500,1,middleeastmonitor.com,https://www.middleeastmonitor.com/20220131-abb...,4,-1,488,429,0,40,1664,1.094891,,,,,1.025855e+09,2021-01-31,202101.0,2021.0,2021.0849,,,,,,,,,,,PSE,WEST BANK,PSE,,,,,,,,1.0,40.0,40.0,4.0,1.0,1.0,4.0,1.0,4.0,1.094891,0.0,,,,,,,,4.0,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,4.0,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,2.022013e+13,https://www.middleeastmonitor.com/20220131-abb...
1,1025854684,2022-01-31 11:15:00,20220131111500,1,middleeastmonitor.com,https://www.middleeastmonitor.com/20220131-abb...,4,-1,503,429,0,20,1664,1.094891,,,,,1.025855e+09,2021-01-31,202101.0,2021.0,2021.0849,,,,,,,,,,,PSE,WEST BANK,PSE,,,,,,,,1.0,40.0,40.0,4.0,1.0,1.0,2.0,1.0,2.0,1.094891,0.0,,,,,,,,4.0,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,1.0,Israel,IS,IS,,31.5000,34.750000,IS,2.022013e+13,https://www.middleeastmonitor.com/20220131-abb...
2,1025854685,2022-01-31 11:15:00,20220131111500,1,middleeastmonitor.com,https://www.middleeastmonitor.com/20220131-abb...,4,-1,503,471,0,40,1664,1.094891,,,,,1.025855e+09,2021-01-31,202101.0,2021.0,2021.0849,,,,,,,,,,,PSE,WEST BANK,PSE,,,,,,,,1.0,46.0,46.0,4.0,1.0,7.0,4.0,1.0,4.0,1.094891,0.0,,,,,,,,4.0,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,4.0,"Ramallah, West Bank (general), West Bank",WE,WE00,68767,31.9026,35.195500,-795956,2.022013e+13,https://www.middleeastmonitor.com/20220131-abb...
3,1025854686,2022-01-31 11:15:00,20220131111500,1,aljazeera.com,https://www.aljazeera.com/news/2022/1/31/india...,23,-1,4219,4231,1,50,4195,-10.447761,,,,,1.025855e+09,2021-01-31,202101.0,2021.0,2021.0849,,,,,,,,,,,REB,REBEL,,,,,,REB,,,0.0,190.0,190.0,19.0,4.0,-10.0,5.0,1.0,5.0,-10.447761,0.0,,,,,,,,4.0,"Kashmir, North-West Frontier, Pakistan",PK,PK03,40350,34.7960,72.150200,-2764806,4.0,"Kashmir, North-West Frontier, Pakistan",PK,PK03,40350,34.7960,72.150200,-2764806,2.022013e+13,https://www.aljazeera.com/news/2022/1/31/india...
4,1025825622,2022-01-31 06:30:00,20220131111500,1,bdnews24.com,https://bdnews24.com/neighbours/2022/01/31/ind...,14,-1,2128,2143,1,10,2103,-12.500000,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471195,1025939176,2022-01-31 21:00:00,20220131230000,1,whbl.com,https://whbl.com/2022/01/31/south-africa-scrap...,1,75,106,123,1,60,560,-6.382979,,,,,1.025939e+09,2022-01-31,202201.0,2022.0,2022.0849,ZAF,JOHANNESBURG,ZAF,,,,,,,,AFR,AFRICA,AFR,,,,,,,,1.0,20.0,20.0,2.0,1.0,3.0,6.0,1.0,6.0,-6.382979,4.0,"Johannesburg, Gauteng, South Africa",SF,SF06,77364,-26.2000,28.083300,-1240261,4.0,"Johannesburg, Gauteng, South Africa",SF,SF06,77364,-26.2000,28.083300,-1240261,4.0,"Johannesburg, Gauteng, South Africa",SF,SF06,77364,-26.2000,28.083300,-1240261,2.022013e+13,https://wdsm710.com/2022/01/31/south-africa-sc...
471196,1025953612,2022-01-31 23:00:00,20220131230000,1,jordantimes.com,http://jordantimes.com/news/local/kings-vision...,10,1831,1877,1868,1,30,2907,6.318083,,,,,1.025954e+09,2022-01-31,202201.0,2022.0,2022.0849,ran,RANA,,,ran,,,,,,JOR,JORDAN,JOR,,,,,,,,0.0,10.0,10.0,1.0,1.0,0.0,3.0,1.0,3.0,6.318083,4.0,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4.0,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,4.0,"Amman, (JO11), Jordan",JO,JO11,36728,31.9500,35.933300,-970362,2.022013e+13,http://jordantimes.com/news/local/kings-vision...
471197,1025953613,2022-01-31 23:00:00,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1232,1226,1,60,5413,-4.592423,,,,,1.025954e+09,2022-01-31,202201.0,2022.0,2022.0849,znd,AZERI,,,znd,,,,,,BUS,COMPANIES,,,,,,BUS,,,1.0,193.0,193.0,19.0,4.0,-10.0,6.0,1.0,6.0,-4.592423,4.0,"Azeri, Ida-Virumaa, Estonia",EN,EN03,15925,59.4506,26.867500,-2621534,4.0,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,4.0,"Glasgow, Glasgow City, United Kingdom",UK,UKV2,40176,55.8333,-4.250000,-2597039,2.022013e+13,https://www.msn.com/en-xl/news/other/court-app...
471198,1025953614,2022-01-31 23:00:00,20220131230000,1,msn.com,https://www.msn.com/en-xl/news/other/court-app...,3,1191,1206,1150,0,40,5413,-4.592423,,,,,1.025954e+09,2022-01-31,202201.0,2022.0,2022.0849,znd,AZERI,,,znd,,,,,,EST,ESTONIA,EST,,,,,,,,1.0,192.0,192.0,19.0,4.0,-9.5,4.0,1.0,4.0,-4.592423,4.0,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4.0,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,4.0,"London, London, City of, United Kingdom",UK,UKH9,40110,51.5000,-0.116667,-2601889,2.022013e+13,https://www.msn.com/en-xl/news/other/court-app...


In [21]:
export_mentions_mentions_translation_joined["mentions_translation_14"].isnull().sum()

146680

## Requête 1 

In [22]:
def requete_1(export_mentions_mentions_translation_joined):
    requete1 = export_mentions_mentions_translation_joined.loc[:,["mentions_0", "mentions_1", "export_53", "mentions_translation_14"]]
    requete1['day'] = requete1["mentions_1"].dt.day
    requete1['month'] = requete1["mentions_1"].dt.month
    requete1['year'] = requete1["mentions_1"].dt.year

    requete1.rename(columns={"mentions_0" : "id_event",
                            "mentions_1" : "datetime",
                            "export_53" : "country_code",
                            "mentions_translation_14" : "source_langue"}, inplace=True)
    
    # Drop all rows with full NaN values
    col = requete1.columns.tolist()
    requete1 = requete1.dropna(subset=col, how='all')
    
    requete1.to_csv(r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete1.csv', index=False)
    
    return requete1

requete1 = requete_1(export_mentions_mentions_translation_joined)

In [23]:
requete1

Unnamed: 0,id_event,datetime,country_code,source_langue,day,month,year
0,1025854683,2022-01-31 11:15:00,WE,,31,1,2022
1,1025854684,2022-01-31 11:15:00,IS,,31,1,2022
2,1025854685,2022-01-31 11:15:00,WE,,31,1,2022
3,1025854686,2022-01-31 11:15:00,PK,,31,1,2022
4,1025825622,2022-01-31 06:30:00,,,31,1,2022
...,...,...,...,...,...,...,...
471195,1025939176,2022-01-31 21:00:00,SF,,31,1,2022
471196,1025953612,2022-01-31 23:00:00,JO,,31,1,2022
471197,1025953613,2022-01-31 23:00:00,UK,,31,1,2022
471198,1025953614,2022-01-31 23:00:00,UK,,31,1,2022


In [24]:
pd.DataFrame(requete1["source_langue"].value_counts())

Unnamed: 0,source_langue
spa,72676
rus,45884
fra,26563
deu,25864
por,20201
bul,18760
ita,15744
ell,11705
ara,11383
ron,8562


In [25]:
pd.DataFrame(requete1["country_code"].value_counts())

Unnamed: 0,country_code
US,40633
UK,13952
RS,9115
UP,5657
CA,4074
...,...
GK,1
TO,1
AV,1
RN,1


## Requête 2

In [26]:
def requete_2(export): 
    
    requete2 = export.loc[:,["export_0", "export_1", "export_53", "export_26"]]
    requete2['day'] = requete2["export_1"].dt.day
    requete2['month'] = requete2["export_1"].dt.month
    requete2['year'] = requete2["export_1"].dt.year

    requete2.rename(columns={"export_0" : "id_event",
                            "export_1" : "datetime",
                            "export_53" : "country_code",
                            "export_26" : "event_code"}, inplace=True)
    
    # Drop all rows with full NaN values
    col = requete2.columns.tolist()
    requete2 = requete2.dropna(subset=col, how='all')
    
    requete2.to_csv(r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete2.csv', index=False)
    
    return requete2

In [27]:
requete_2 = requete_2(export)

In [28]:
requete_2

Unnamed: 0,id_event,datetime,country_code,event_code,day,month,year
0,1025854683,2021-01-31,WE,40,31,1,2021
1,1025854684,2021-01-31,IS,40,31,1,2021
2,1025854685,2021-01-31,WE,46,31,1,2021
3,1025854686,2021-01-31,PK,190,31,1,2021
4,1025854687,2021-01-31,AE,10,31,1,2021
...,...,...,...,...,...,...,...
997,1025953611,2022-01-31,YM,51,31,1,2022
998,1025953612,2022-01-31,JO,10,31,1,2022
999,1025953613,2022-01-31,UK,193,31,1,2022
1000,1025953614,2022-01-31,UK,192,31,1,2022


## Requête 3

In [29]:
def requete_3(gkg):    
    requete3 = gkg.loc[:,["gkg_0","gkg_1", "gkg_3", "gkg_7", "gkg_11", "gkg_9", "gkg_15"]]

    requete3['day'] = requete3["gkg_1"].dt.day
    requete3['month'] = requete3["gkg_1"].dt.month
    requete3['year'] = requete3["gkg_1"].dt.year

    requete3.rename(columns={"gkg_0" : "id_gkg",
                             "gkg_1" : "datetime",
                            "gkg_3" : "source_domain",
                            "gkg_7" : "themes",
                            "gkg_11" : "persons", 
                            "gkg_9" : "locations",
                            "gkg_15" : "avg_tone"}, inplace=True)


    requete3["locations"] = requete3["locations"].apply(lambda x : str(x).split(",")[0].split("#")[-1])
    requete3["avg_tone"] = requete3["avg_tone"].apply(lambda x : float(str(x).split(",")[0]))
    
    # Drop all rows with full NaN values
    col = requete3.columns.tolist()
    requete3 = requete3.dropna(subset=col, how='all')

    requete3.to_csv(r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete3.csv', index=False)

    return requete3

In [30]:
requete3 = requete_3(gkg)

In [31]:
requete3

Unnamed: 0,id_gkg,datetime,source_domain,themes,persons,locations,avg_tone,day,month,year
0,20220131111500-0,2022-01-31 11:15:00,pressandjournal.co.uk,TAX_ETHNICITY;TAX_ETHNICITY_SCOTTISH;EPU_ECONO...,paul gordon;fraser sime,UK,2.278481,31,1,2022
1,20220131111500-1,2022-01-31 11:15:00,hancinema.net,TAX_ETHNICITY;TAX_ETHNICITY_KOREAN;TAX_WORLDLA...,lee seung-young,,-4.081633,31,1,2022
2,20220131111500-2,2022-01-31 11:15:00,dailystar.co.uk,RELIGION;,gary speed;dan walker;sarah walker,Manchester,0.720461,31,1,2022
3,20220131111500-3,2022-01-31 11:15:00,hltv.org,EDUCATION;SOC_POINTSOFINTEREST;SOC_POINTSOFINT...,,,-3.846154,31,1,2022
4,20220131111500-4,2022-01-31 11:15:00,armenpress.am,TAX_FNCACT;TAX_FNCACT_ENGINEER;MANMADE_DISASTE...,gayane gaboyan;albert hovakimyan,Azeri,1.670644,31,1,2022
...,...,...,...,...,...,...,...,...,...,...
1630,20220131230000-1630,2022-01-31 23:00:00,palmbeachpost.com,UNGP_FORESTS_RIVERS_OCEANS;EPU_CATS_MIGRATION_...,louise pearson;kathy hillard dimpflmaier;lisa ...,Miami,-1.047120,31,1,2022
1631,20220131230000-1631,2022-01-31 23:00:00,cosmopolitan.com,TAX_DISEASE;TAX_DISEASE_CANCER;WB_1406_DISEASE...,,,-1.438849,31,1,2022
1632,20220131230000-1632,2022-01-31 23:00:00,yahoo.com,EPU_CATS_REGULATION;WB_1921_PRIVATE_SECTOR_DEV...,tobias koppers;yehuda katz;stefan penner;tom d...,,-1.147932,31,1,2022
1633,20220131230000-1633,2022-01-31 23:00:00,iheart.com,NATURAL_DISASTER;NATURAL_DISASTER_HEAVY_SNOW;N...,,,-2.962963,31,1,2022


In [32]:
gkg_translation

Unnamed: 0,gkg_translation_0,gkg_translation_1,gkg_translation_2,gkg_translation_3,gkg_translation_4,gkg_translation_5,gkg_translation_6,gkg_translation_7,gkg_translation_8,gkg_translation_9,gkg_translation_10,gkg_translation_11,gkg_translation_12,gkg_translation_13,gkg_translation_14,gkg_translation_15,gkg_translation_16,gkg_translation_17,gkg_translation_18,gkg_translation_19,gkg_translation_20,gkg_translation_21,gkg_translation_22,gkg_translation_23,gkg_translation_24,gkg_translation_25,gkg_translation_26
0,20220131111500-T0,2022-01-31 11:15:00,1.0,kudyznudy.cz,https://www.kudyznudy.cz/akce/valentynsky-work...,,,TAX_FNCACT;TAX_FNCACT_WOMEN;TAX_ETHNICITY;TAX_...,"TAX_ECON_PRICE,545;TAX_ECON_PRICE,625;TAX_ETHN...","4#Prague, Praha, HlavnÃ­Esto, Czech Republic#E...","4#Prague, Praha, HlavnÃ­Esto, Czech Republic#E...",,,,,"-0.689655172413793,0.689655172413793,1.3793103...",,"wc:129,c12.1:5,c12.10:5,c12.12:1,c12.14:4,c12....",https://www.kudyznudy.cz/files/29/29deb825-bbb...,,,https://youtube.com/user/kudyznudycz?feature=r...,,,"141,of the,248;221,of The black Bridge,276;280...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_TITLE>Kudy z nudy - Valent&#xFD;nsk&#xFD...
1,20220131111500-T1,2022-01-31 11:15:00,1.0,kudyznudy.cz,https://www.kudyznudy.cz/akce/moderni-umeni-po...,,,TAX_ETHNICITY;TAX_ETHNICITY_CZECH;TAX_WORLDLAN...,"TAX_ETHNICITY_CZECH,148;TAX_WORLDLANGUAGES_CZE...","4#Bohnice, StredoceskÃ½, Czech Republic#EZ#EZ8...","4#Bohnice, StredoceskÃ½, Czech Republic#EZ#EZ8...",,,,,"4.02010050251256,6.03015075376884,2.0100502512...",,"wc:185,c1.1:1,c1.3:2,c12.1:14,c12.10:11,c12.12...",https://www.kudyznudy.cz/files/e1/e14b5bef-9f4...,,,https://youtube.com/user/kudyznudycz?feature=r...,,"Her Barbara From,216;Beginnings Were,740",,srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_TITLE>Kudy z nudy - Modern&#xED; um&#x11...
2,20220131111500-T2,2022-01-31 11:15:00,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/studie-konsp...,,,TAX_FNCACT;TAX_FNCACT_SUPPORTERS;EXTREMISM;TAX...,"TAX_ETHNICITY_GERMAN,1366;TAX_WORLDLANGUAGES_G...",1#Germany#GM#GM#51.5#10.5#GM;1#Hungary#HU#HU#4...,1#Polish#PL#PL##52#20#PL#1426;1#Israeli#IS#IS#...,johannes gutenberga,"Johannes Gutenberga,581",research has,"Research Has,603;Research Has,2177","-0.915750915750916,2.38095238095238,3.29670329...",,"wc:523,c12.1:42,c12.10:65,c12.12:16,c12.13:26,...",https://i3.cn.cz/14/1535442500_P20180828025480...,,,https://youtube.com/user/CTKvideozpravy/feed;,,"Czech Republic,253;Universities Johannes Guten...","26,countries including the Czech,173;2,stages,...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://www.nature.com/articles/s4...
3,20220131111500-T3,2022-01-31 11:15:00,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/londyn-avizu...,,,SANCTIONS;TAX_WORLDLANGUAGES;TAX_WORLDLANGUAGE...,"EPU_POLICY_GOVERNMENT,1365;EPU_POLICY_GOVERNME...","4#City Of London, London, City Of, United King...",1#Russia#RS#RS##60#100#RS#168;1#Russia#RS#RS##...,susanne kass;simon clarke;vladimir putin;boris...,"Susanne Kass,3248;Simon Clarke,424;Vladimir Pu...",development has;agency reuters,"Development Has,2781;Agency Reuters,1506","-4.15094339622642,1.50943396226415,5.660377358...",,"wc:525,c12.1:30,c12.10:48,c12.11:1,c12.12:19,c...",https://i3.cn.cz/14/1490884966_P20170330083310...,,,https://youtube.com/user/CTKvideozpravy/feed;,,"Moscow Kingdom,137;New Russian,168;British Tre...","11,of the 2022,2580;",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://www.thetimes.co.uk/article...
4,20220131111500-T4,2022-01-31 11:15:00,1.0,ceskenoviny.cz,https://www.ceskenoviny.cz/zpravy/v-karlovarsk...,,,MANMADE_DISASTER_IMPLIED;,"MANMADE_DISASTER_IMPLIED,924;MANMADE_DISASTER_...","1#Czech Republic#EZ#EZ#49.75#15#EZ;4#Bochov, K...","4#Prague, Praha, HlavnÃ­Esto, Czech Republic#E...",cheb transmotel,"Cheb Transmotel,1091",,,"-2.83768444948922,0.340522133938706,3.17820658...",,"wc:820,c1.3:2,c12.1:43,c12.10:68,c12.12:20,c12...",https://i3.cn.cz/14/1643619479_P2022013102743.jpg,https://i3.cn.cz/6/1642761615_P2022012104310.j...,,https://youtube.com/user/CTKvideozpravy/feed;,,"Prague Heavy,99;Problems May,189;Problems May,...","22,photo,264;22,photo,296;22,photo,328;22,the ...",srclc:ces;eng:Moses 2.1.1 / MosesCore Europarl...,<PAGE_LINKS>https://dopravniinfo.cz</PAGE_LINK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,20220131230000-T1408,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/smartphone-man-hinh-ga...,"AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...","AFFECT#2000000000##1#Vietnam, Republic Of#VM#V...",TAX_ECON_PRICE;TAX_FNCACT;TAX_FNCACT_CHILDREN;...,"IDEOLOGY,5431;MEDIA_MSM,3219;TAX_ECON_PRICE,78...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#Vietnam#VM#VM##16.166667#107.833333#VM#1169,,,,,"1.17860380779692,4.3517679057117,3.17316409791...",,"wc:1070,nwc:1280,c1.1:4,c1.3:2,c12.1:111,c12.1...",https://i.vietgiaitri.com/2022/2/1/smartphone-...,https://i.vietgiaitri.com/2022/1/28/cu-1-ngay-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Pictured Phone,459;Fresh New,2622;Apple New,28...","9,quality brought a cover,2124;1000000000,USD ...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/galaxy-z-f...
1409,20220131230000-T1409,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/the-gioi-da-ghi-nhan-t...,,,CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WELLBEING...,"GENERAL_GOVERNMENT,2283;EPU_POLICY_GOVERNMENT,...","1#Vietnam, Republic Of#VM#VM#16.166667#107.833...",1#India#IN#IN##20#77#IN#1674;1#India#IN#IN##20...,europe europe;europe asia,"Europe Europe,822;Europe Europe,978;Europe Asi...",health india,"Health India,1892","-2.68817204301075,2.01612903225806,4.704301075...",,"wc:734,nwc:933,c1.3:1,c12.1:46,c12.10:106,c12....",https://t.vietgiaitri.com/2022/2/1/the-gioi-da...,https://i.vietgiaitri.com/2022/1/17/gioi-chuc-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"World Under,49;Pictured Staff,311;Hospital Pic...","1000000,World Under p statistics,32;1000000,th...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/benh-nhan-...
1410,20220131230000-T1410,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/thuong-lai-dam-nuoc-ma...,WOUND#2000##0######;CRISISLEX_CRISISLEXREC#200...,WOUND#2000##0#######0;CRISISLEX_CRISISLEXREC#2...,WOUND;CRISISLEX_CRISISLEXREC;CRISISLEX_C03_WEL...,"TAX_ECON_PRICE,511;TAX_ECON_PRICE,1077;TAX_ECO...",,,,,english campaign,"English Campaign,1859","1.59340659340659,6.37362637362637,4.7802197802...",,"wc:1799,nwc:1834,c1.1:4,c1.3:1,c12.1:192,c12.1...",https://i.vietgiaitri.com/2022/2/1/thuong-lai-...,https://i.vietgiaitri.com/2022/1/31/giao-thua-...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Injured Drive,14;Flower New Year,67;Life Noon,...","2,English Campaign s traders,1558;1000000,copp...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/ban-hoa-ke...
1411,20220131230000-T1411,2022-01-31 23:00:00,1.0,vietgiaitri.com,https://vietgiaitri.com/thu-tuong-nhanh-chong-...,,,TAX_FNCACT;TAX_FNCACT_MINISTER;LEADER;TAX_FNCA...,"TAX_FNCACT_GUIDE,1083;TAX_FNCACT_CHILD,1904;TA...","4#Hanoi, Ha N?I, Vietnam, Republic Of#VM#VM44#...",1#Reunion#RE#RE##-21.1#55.6#RE#2973;1#Reunion#...,,,,,"3.31534309946029,6.32228218966847,3.0069390902...",,"wc:1275,nwc:1255,c1.1:1,c1.2:3,c12.1:104,c12.1...",https://t.vietgiaitri.com/2022/2/1/thu-tuong-n...,https://i.vietgiaitri.com/2022/1/22/chu-tich-n...,,https://youtube.com/c/VGTTV?sub_confirmation=1...,,"Prime Minister Fast,20;New Year Life Here,88;P...","1000000,people learning work,3915;1000000000,U...",srclc:vie;eng:GT-VIE 1.0,<PAGE_LINKS>https://vietgiaitri.com/bo-giao-th...


## Requête 4

In [33]:
def requete_4(gkg_translation):
    
    requete4 = gkg_translation.loc[:,["gkg_translation_0", "gkg_translation_7", "gkg_translation_11", "gkg_translation_9", "gkg_translation_15", "gkg_translation_25"]]

    requete4.rename(columns={"gkg_translation_0" : "id_gkg_translation",
                             "gkg_translation_7" : "themes",
                            "gkg_translation_11" : "persons", 
                            "gkg_translation_9" : "locations",
                            "gkg_translation_15" : "avg_tone",
                            "gkg_translation_25" : "source_langue"}, inplace=True)

    requete4 = requete4.replace(to_replace='None', value=np.nan).dropna()
    requete4["source_langue"] = requete4["source_langue"].apply(lambda x: x.split(";")[0].split(":")[1])
    requete4["locations"] = requete4["locations"].apply(lambda x : str(x).split(",")[0].split("#")[-1])
    requete4["avg_tone"] = requete4["avg_tone"].apply(lambda x : float(str(x).split(",")[0]))
    
    # Drop all rows with full NaN values
    col = requete4.columns.tolist()
    requete4 = requete4.dropna(subset=col, how='all')
    
    requete4.to_csv(r'C:/HUGO/Ecole/Telecom Paris/COURS/INF_728_Base_de_donnees_non_relationnelles/GDELT Project/requete4.csv', index=False)


    return requete4

In [34]:
requete4 = requete_4(gkg_translation)

In [35]:
requete4

Unnamed: 0,id_gkg_translation,themes,persons,locations,avg_tone,source_langue
2,20220131111500-T2,TAX_FNCACT;TAX_FNCACT_SUPPORTERS;EXTREMISM;TAX...,johannes gutenberga,Prague,-0.915751,ces
3,20220131111500-T3,SANCTIONS;TAX_WORLDLANGUAGES;TAX_WORLDLANGUAGE...,susanne kass;simon clarke;vladimir putin;boris...,City Of London,-4.150943,ces
4,20220131111500-T4,MANMADE_DISASTER_IMPLIED;,cheb transmotel,Bochov,-2.837684,ces
6,20220131111500-T6,TAX_FNCACT;TAX_FNCACT_DRIVERS;TAX_ETHNICITY;TA...,cendisu john paroubek,Prague,-0.985222,ces
11,20220131111500-T11,EPU_POLICY;EPU_POLICY_POLITICAL;LEADER;TAX_FNC...,kais saied;khalil zawiya,FR,-7.368421,fra
...,...,...,...,...,...,...
1379,20220131230000-T1379,LEADER;,buster karno,Senayan,-7.291667,ind
1381,20220131230000-T1381,TAX_FNCACT;TAX_FNCACT_POLICE;EDUCATION;,sabang merauke,East Java,-7.027027,ind
1400,20220131230000-T1400,WB_137_WATER;TAX_ETHNICITY;TAX_ETHNICITY_CHINE...,europe asia,CH,-0.912863,vie
1401,20220131230000-T1401,TAX_FNCACT;TAX_FNCACT_MINISTER;LEADER;TAX_FNCA...,europe europe,Hanoi,0.372024,vie
