# Clean the extracted Wikipedia dump files

In [1]:
import re
import pandas as pd
import os

In [2]:
# open 1 of the extracted docs
# with open('../../../enwiki_extracted/ms2/AA/wiki_00', 'r') as fh:
#     doc = fh.read()

In [3]:
# split per article
# docs = doc.split('</doc>')

In [4]:
# define functions for extracting metadata
def find_id(string):
    """Extract the numeric article id from one extracted article chunk.

    Returns the digits of the first ``id="<digits>"`` attribute found in
    ``string``. In the extractor output this is presumably the ``id``
    attribute of the opening ``<doc ...>`` tag.

    Parameters
    ----------
    string : str
        Text of a single article chunk, header included.

    Returns
    -------
    int
        The id, converted to ``int``.

    Raises
    ------
    ValueError
        If ``string`` contains no ``id="<digits>"`` attribute.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence. The capture group yields the digits directly, so no manual
    # slicing between quotation marks is needed.
    match = re.search(r'id="(\d+)"', string)
    if match is None:
        raise ValueError('no id="<digits>" attribute found in article')

    return int(match.group(1))
 
def find_url(string):
    """Extract the first English-Wikipedia URL from one article chunk.

    The URL is matched as ``https://en.wikipedia.org/`` followed by one or
    more ASCII letters, digits, ``?`` or ``=`` (e.g.
    ``https://en.wikipedia.org/wiki?curid=8683``).

    Parameters
    ----------
    string : str
        Text of a single article chunk, header included.

    Returns
    -------
    str
        The first matching URL.

    Raises
    ------
    ValueError
        If ``string`` contains no matching URL.
    """
    # The character class uses A-Z: the range "A-z" would also admit the
    # punctuation characters that sit between "Z" and "a" in ASCII
    # ([ \ ] ^ _ `). Raw string so that "\." and "\d" reach the regex engine
    # without being treated as string escapes.
    url_pattern = r'https://en\.wikipedia\.org/[a-zA-Z?=\d]+'

    match = re.search(url_pattern, string)
    if match is None:
        raise ValueError('no en.wikipedia.org URL found in article')

    return match.group(0)

def find_title(string):
    """Extract the article title from one article chunk.

    Returns the value of the first ``title="..."`` attribute in ``string``.
    The closing quotation mark is the first one that is followed by the end
    of the tag (``>``), by another ``name=`` attribute, or by the end of the
    text. This keeps titles intact when they contain ``>`` or quotation
    marks themselves, and allows an empty title.

    Parameters
    ----------
    string : str
        Text of a single article chunk, header included.

    Returns
    -------
    str
        The title text between the attribute's quotation marks.

    Raises
    ------
    ValueError
        If ``string`` contains no ``title="..."`` attribute.
    """
    # Lazy capture plus a look-ahead for what must follow the closing quote.
    # "." does not match newlines, so the search stays on the header line.
    pattern = r'title="(.*?)"(?=\s*>|\s+[\w:-]+=|\s*$)'

    match = re.search(pattern, string)
    if match is None:
        raise ValueError('no title="..." attribute found in article')

    return match.group(1)

In [5]:
# Read every extracted dump file of one directory into memory, one string per file.
path = "../../../enwiki_extracted/ms1/AA"

wikidump = []

# TODO: expand this to go over sub-directories as well
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# article order (and therefore the resulting table) reproducible between runs.
for filename in sorted(os.listdir(path)):
    f = os.path.join(path, filename)
    if not os.path.isfile(f):
        # A sub-directory cannot be opened as a file; skip it until the TODO above is done.
        continue
    # Explicit encoding instead of the platform default.
    # NOTE(review): assumes the extracted files are UTF-8 — confirm against the extractor settings.
    with open(f, 'r', encoding='utf-8') as fh:
        wikidump.append(fh.read())

***

In [6]:
# Cut each file's text at every closing '</doc>' tag and flatten the pieces
# of all files into a single list of article chunks.
wikidump = [chunk for file_text in wikidump for chunk in file_text.split('</doc>')]

In [7]:
# Number of chunks produced by splitting the loaded files on '</doc>'.
len(wikidump)

7412

# Processing

In [8]:
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of word characters; everything between them is discarded.
# Raw string so that "\w" is passed to the regex engine instead of being
# read as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r'\w+')

In [9]:
# Read the cities table; the CSV file uses ';' as its field delimiter.
fp = '../input/List_of_cities_300k.csv'
cities = pd.read_csv(fp, delimiter=';')

# cities = cities[cities.SizeMUA1k >= 300] # only cities with a population of 300.000 or more

In [10]:
# Preview the first rows of the cities table.
cities.head()

Unnamed: 0,id_MUA,Mua,Mua_Eng,SizeMUA1k,EU15,Code_Country,NUTS_1,NUTS_2,NUTS_3,X_COORD_first,...,PUR,Supra_poly_fua,PIA,Name_supra_poly_fua,Poly_fua,id_poly_fua,Name_poly_fua,SizeHinterland1k,GDP_per_capita,Dummy_Capital
0,FR00810,Paris,Paris,9591,1,FR,FR1,FR10,FR101,-923451,...,,0,PIA_Paris,99,0,,,1201,44,1
1,UK01886,London,London,8256,1,UK,UKI,UKI1,UKI11,-1039874,...,,0,PIA_London,99,0,,,2752,45,1
2,ES00540,Madrid,Madrid,4955,1,ES,ES3,ES30,ES300,-1573058,...,,0,PIA_Madrid,99,0,,,308,29,1
3,DE00219,Berlin,Berlin,3776,1,DE,DE3,DE30,DE300,-108502,...,,0,PIA_Berlin,99,0,,,240,22,1
4,IT01156,Milano,Milan,3698,1,IT,ITC,ITC4,ITC45,-455713,...,Milano,0,PIA_Milano,99,1,IT03,Milano metropolitan area,604,35,0


In [19]:
# Small words that occur inside multi-word city names and should not count as a city hit.
stopwords = ['a', 'The', 'the', 'A', 'de', 'di', 'en', 'am']

# Flat list of city-name components: every English city name is tokenized,
# its tokens are kept in order, and the stopwords above are left out.
cities_ls = [
    name_part
    for city_name in cities.Mua_Eng
    for name_part in tokenizer.tokenize(city_name)
    if name_part not in stopwords
]

In [21]:
def list_in_corpus(list_of_words, text_corpus):
    """Return True if at least one word of ``list_of_words`` is in ``text_corpus``.

    The check is Python's ``in`` operator, so its meaning follows the type of
    ``text_corpus``: for a ``str`` it is a case-sensitive substring test, for
    a list or set of tokens it is exact membership.

    Parameters
    ----------
    list_of_words : iterable
        Words to look for.
    text_corpus : str or container
        Text (or collection of tokens) to search in.

    Returns
    -------
    bool
        True as soon as one word is found, False if none is found or
        ``list_of_words`` is empty.
    """
    # any() stops at the first hit instead of testing every remaining word.
    return any(word in text_corpus for word in list_of_words)

In [23]:
articles = []

# For every chunk keep a tuple (article id, title, url, tokens) if it passes the filters below.
for article in wikidump:
    try:
        article_id = find_id(article)
        url = find_url(article)
        title = find_title(article)
    except (IndexError, ValueError):
        # The chunk has no parsable id/url/title (for example the text left
        # over after the last '</doc>' of a file): skip it. Only these lookup
        # failures are ignored; any other error is allowed to surface.
        continue

    # The whole chunk is tokenized, so the tokens also include the header fields.
    tokenized_text = tokenizer.tokenize(article)
    # need to add some more cleaning of text / exclude stopwords / replace accents/etc/ maybe only certain POS-tags
    # Keep chunks with more than 20 tokens that mention at least one city-name component.
    # `and` short-circuits, so the city scan is skipped for chunks that are too short.
    # NOTE(review): `article` is the raw string here, so the city test is a substring
    # match (a name component inside a longer word also counts) — confirm this is intended.
    if len(tokenized_text) > 20 and list_in_corpus(cities_ls, article):
        articles.append((article_id, title, url, tokenized_text))

In [24]:
len(articles) # NOTE(review): this comment previously said 5393, which does not match the recorded output below — confirm which run or input that number came from

2899

In [25]:
# create df of doc
df_articles = pd.DataFrame(articles, columns = ['article_id', 'title', 'url', 'text'])
df_articles.head()

Unnamed: 0,article_id,title,url,text
0,8683,Disc jockey,https://en.wikipedia.org/wiki?curid=8683,"[doc, id, 8683, url, https, en, wikipedia, org..."
1,8687,Detroit,https://en.wikipedia.org/wiki?curid=8687,"[doc, id, 8687, url, https, en, wikipedia, org..."
2,8688,Deccan Traps,https://en.wikipedia.org/wiki?curid=8688,"[doc, id, 8688, url, https, en, wikipedia, org..."
3,8691,Divination,https://en.wikipedia.org/wiki?curid=8691,"[doc, id, 8691, url, https, en, wikipedia, org..."
4,8693,Diets of Nuremberg,https://en.wikipedia.org/wiki?curid=8693,"[doc, id, 8693, url, https, en, wikipedia, org..."


In [26]:
# Spot check on the second row. The corpus passed here is that row's token list,
# so `in` inside list_in_corpus is an exact token membership test.
list_in_corpus(cities_ls, df_articles.text[1])

True

In [135]:
# Keep only the id, title and token columns for export (the url column is left out).
df = df_articles.loc[:, ['article_id', 'title', 'text']]

In [136]:
# Write the selected columns to a CSV file.
# NOTE(review): to_csv is called with its defaults, so the row index is written as an
# extra unnamed first column, and the token lists in `text` are presumably stored as
# their string representation — confirm that the code reading this file expects both.
df.to_csv('../../../enwikitrial1folder_exclstumps.csv')