__Files created__

- text_api.csv

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and output CSV paths used below resolve.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


### Extracting text data through the Wikipedia API

In [None]:
import pandas as pd
import requests

In [None]:
# Load the article list (title, page_id, quality) whose text will be fetched.
dataset_path = '/content/drive/MyDrive/학교/Dissertation/Data Collection2/dataset.csv'
data = pd.read_csv(dataset_path)
data

Unnamed: 0,title,page_id,quality
0,Mayan languages,182013,FA
1,Mu'awiya I,207068,FA
2,The Fountainhead,180464,FA
3,Northern pintail,218361,FA
4,Manhattan Project,19603,FA
...,...,...,...
5195,Party of Democratic Kampuchea,265468,Stub
5196,Minawara and Multultu,95240,Stub
5197,Theophylline/ephedra/hydroxyzine,262652,Stub
5198,"Channel Lake, Illinois",111450,Stub


In [None]:
# Function to extract text and page ID
def get_wikipedia_articles(titles, session=None, timeout=30):
    """Fetch the plain-text extract and page ID for each Wikipedia title.

    One request per title is sent to the English Wikipedia ``action=query``
    endpoint with ``prop=extracts`` and ``explaintext`` set.

    Parameters
    ----------
    titles : str or iterable of str
        Article titles to look up. A single string is treated as one title
        (not iterated character by character).
    session : object, optional
        Anything exposing ``get(url, params=..., timeout=...)`` and returning
        an object with ``status_code`` and ``json()`` (e.g. a
        ``requests.Session``). When omitted, ``requests.get`` is used.
    timeout : float, optional
        Per-request timeout in seconds, so a stalled connection cannot hang
        the whole crawl.

    Returns
    -------
    list of dict
        For every page in a successful response:
        ``{'title', 'page_id', 'text'}``, where ``title`` is the title as
        reported by the API (not necessarily the string that was requested),
        ``page_id`` is ``None`` when the API returns no ``pageid`` and
        ``text`` is ``''`` when no extract is returned.
        For every title whose request failed: ``{'title', 'error'}`` with the
        requested title and a description of the failure.
    """
    # Define the endpoint URL
    endpoint = "https://en.wikipedia.org/w/api.php"

    if isinstance(titles, str):
        titles = [titles]

    # Only touch the `requests` module when no session was injected.
    fetch = requests.get if session is None else session.get

    # Prepare the list to store all articles
    all_articles = []

    for requested_title in titles:
        # Define parameters for each title
        params = {
            "action": "query",
            "format": "json",
            "titles": requested_title,
            "prop": "extracts",
            "explaintext": True
        }

        # Make the request; a network failure is recorded, not propagated,
        # so one bad title does not discard the rest of the batch.
        try:
            response = fetch(endpoint, params=params, timeout=timeout)
        except requests.RequestException as exc:
            all_articles.append({
                'title': requested_title,
                'error': f"Error: Request to Wikipedia API failed: {exc}"
            })
            continue

        # Check if the request was successful
        if response.status_code != 200:
            all_articles.append({
                'title': requested_title,
                'error': f"Error: Unable to fetch data from Wikipedia API. Status code: {response.status_code}"
            })
            continue

        # Parse the response (JSON decode errors are ValueError subclasses)
        try:
            payload = response.json()
        except ValueError:
            all_articles.append({
                'title': requested_title,
                'error': "Error: Wikipedia API returned a response that is not valid JSON."
            })
            continue

        # The API can answer HTTP 200 with an error payload that has no
        # 'query' key; treat that as a failed lookup instead of a KeyError.
        query = payload.get('query') if isinstance(payload, dict) else None
        pages = query.get('pages') if isinstance(query, dict) else None
        if not pages:
            api_error = payload.get('error') if isinstance(payload, dict) else None
            detail = f" API error: {api_error}" if api_error else ""
            all_articles.append({
                'title': requested_title,
                'error': f"Error: Wikipedia API response did not contain any pages.{detail}"
            })
            continue

        # Extract relevant information
        for page in pages.values():
            all_articles.append({
                'title': page.get('title'),
                'page_id': page.get('pageid'),
                'text': page.get('extract', '')
            })

    return all_articles

In [None]:
# Example
titles = ["Channel Lake, Illinois"]
page_info = get_wikipedia_articles(titles)
print(page_info)

[{'title': 'Channel Lake, Illinois', 'page_id': 111450, 'text': "Channel Lake is an unincorporated community and census-designated place (CDP) in Antioch Township, Lake County, Illinois, United States. Per the 2020 census, the population was 1,581.\n\n\n== Geography ==\nChannel Lake is located in northwestern Lake County at 42°28′40″N 88°8′55″W, on the northwest and southwest sides of Channel Lake, part of the Chain O'Lakes system of lakes in northern Illinois leading to the Fox River. It is bordered to the east by the community of Lake Catherine and to the north by the village of Salem Lakes, Wisconsin.\nAccording to the United States Census Bureau, the Channel Lake CDP has a total area of 2.4 square miles (6.2 km2), of which 1.8 square miles (4.7 km2) are land and 0.6 square miles (1.6 km2), or 24.90%, are water.\n\n\n== Demographics ==\n\n\n=== 2020 census ===\n\n\n=== 2000 Census ===\nAs of the census of 2000, there were 1,785 people, 696 households, and 473 families residing in th

In [None]:
# Fetch dataset rows 0-1999 in consecutive 50-title batches; each element of
# text1 is the list of records returned for one batch.
text1 = [
    get_wikipedia_articles(data['title'].iloc[start:start + 50].tolist())
    for start in range(0, 2000, 50)
]

In [None]:
# Number of batches fetched for rows 0-1999 (range(0, 2000, 50) yields 40)
len(text1)

40

In [None]:
# Fetch dataset rows 2000-3999 in consecutive 50-title batches; each element
# of text2 is the list of records returned for one batch.
text2 = [
    get_wikipedia_articles(data['title'].iloc[start:start + 50].tolist())
    for start in range(2000, 4000, 50)
]

In [None]:
# Number of batches fetched for rows 2000-3999 (range(2000, 4000, 50) yields 40)
len(text2)

40

In [None]:
# Fetch dataset rows 4000-5199 in consecutive 50-title batches; each element
# of text3 is the list of records returned for one batch.
text3 = [
    get_wikipedia_articles(data['title'].iloc[start:start + 50].tolist())
    for start in range(4000, 5200, 50)
]

In [None]:
# Number of batches fetched for rows 4000-5199 (range(4000, 5200, 50) yields 24)
len(text3)

24

In [None]:
# Gather the batches of the first two crawls into one flat list of batches.
# extend() takes however many batches each crawl produced, so nothing is
# silently dropped (or an IndexError raised) if the batch count is not 40.
text = []
for batch_group in (text1, text2):
    text.extend(batch_group)

In [None]:
# Batches collected so far: those of text1 plus those of text2
len(text)

80

In [None]:
# Add every batch of the third crawl; extend() avoids hard-coding the batch
# count (24), which would truncate or raise if the crawl range changed.
text.extend(text3)

In [None]:
# Total number of batches after adding those of text3
len(text)

104

In [None]:
# Combine all collected data into a dataframe
df_text = pd.DataFrame(text[0])
for i in range(1, len(text)):
    df_add = pd.DataFrame(text[i])
    df_text = pd.concat([df_text, df_add])

In [None]:
# Total number of article records across all batches
len(df_text)

5200

In [None]:
# Count missing values per column. Records produced by the error branch of
# get_wikipedia_articles carry only 'title' and 'error', so failed requests
# would show up here as nulls in 'page_id' and 'text'.
df_text.isnull().sum()

title      0
page_id    0
text       0
dtype: int64

In [None]:
# The concatenated frame keeps each batch's own row labels; replace them with
# one continuous 0..N-1 index and discard the old labels.
df_text = df_text.reset_index(drop=True)
df_text

Unnamed: 0,title,page_id,text
0,Mayan languages,182013,The Mayan languages form a language family spo...
1,Mu'awiya I,207068,"Mu'awiya I (Arabic: معاوية بن أبي سفيان, roman..."
2,The Fountainhead,180464,The Fountainhead is a 1943 novel by Russian-Am...
3,Northern pintail,218361,The pintail or northern pintail (Anas acuta) i...
4,Manhattan Project,19603,The Manhattan Project was a research and devel...
...,...,...,...
5195,Party of Democratic Kampuchea,265468,The Party of Democratic Kampuchea was a politi...
5196,Minawara and Multultu,95240,"In Aboriginal mythology, Minawara and Multultu..."
5197,Theophylline/ephedra/hydroxyzine,262652,Theophylline/ephedra/hydroxyzine (trade name M...
5198,"Channel Lake, Illinois",111450,Channel Lake is an unincorporated community an...


In [None]:
# Persist the fetched article text (without the row index) for later notebooks.
text_api_path = '/content/drive/MyDrive/학교/Dissertation/Data Collection2/text_api.csv'
df_text.to_csv(text_api_path, index=False)

- Check whether the page IDs returned by the API match the page IDs in the dataset

In [None]:
# Join the dataset with the fetched text on 'title'. The shared 'page_id'
# column is suffixed: page_id_x comes from the dataset, page_id_y from the API.
# NOTE(review): merge defaults to an inner join, so a title present in only one
# of the two frames is dropped here and cannot appear in the mismatch check.
combined = pd.merge(data, df_text, on='title')
combined

Unnamed: 0,title,page_id_x,quality,page_id_y,text
0,Mayan languages,182013,FA,182013,The Mayan languages form a language family spo...
1,Mu'awiya I,207068,FA,207068,"Mu'awiya I (Arabic: معاوية بن أبي سفيان, roman..."
2,The Fountainhead,180464,FA,180464,The Fountainhead is a 1943 novel by Russian-Am...
3,Northern pintail,218361,FA,218361,The pintail or northern pintail (Anas acuta) i...
4,Manhattan Project,19603,FA,19603,The Manhattan Project was a research and devel...
...,...,...,...,...,...
5195,Party of Democratic Kampuchea,265468,Stub,265468,The Party of Democratic Kampuchea was a politi...
5196,Minawara and Multultu,95240,Stub,95240,"In Aboriginal mythology, Minawara and Multultu..."
5197,Theophylline/ephedra/hydroxyzine,262652,Stub,262652,Theophylline/ephedra/hydroxyzine (trade name M...
5198,"Channel Lake, Illinois",111450,Stub,111450,Channel Lake is an unincorporated community an...


In [None]:
# Rows whose dataset page ID differs from the API page ID (empty when all match)
id_mismatch = combined['page_id_x'].ne(combined['page_id_y'])
combined.loc[id_mismatch]

Unnamed: 0,title,page_id_x,quality,page_id_y,text
