In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import requests
import pandas as pd
from datetime import datetime
import re

In [12]:
df = pd.read_csv('/content/drive/MyDrive/학교/Dissertation/Data Collection/dataset.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df

Unnamed: 0,page_id,title,text,quality
0,9941,Æthelberht of Kent,{{Short description|King of Kent (589–616)}}\n...,FA
1,128158,Macintosh Classic,{{Short description|Personal computer by Apple...,FA
2,4925,Blue whale,"{{Short description|Baleen whale, largest anim...",FA
3,311236,Water rail,{{short description|Species of bird}}\n{{Speci...,FA
4,91171,Hitler Diaries,{{short description|Forged journals purportedl...,FA
...,...,...,...,...
5195,98303,Hedetet,{{short description|Ancient Egyptian scorpion ...,Stub
5196,267460,Glass Eels,{{Use dmy dates|date=April 2022}}\n{{italic ti...,Stub
5197,250544,Carlos Manuel Pruneda,{{Short description|Cuban musician}}\n{{BLP so...,Stub
5198,218159,Romsås,"{{Short description|Neighborhood in Oslo, Norw...",Stub


In [13]:
def get_wikipedia_article_info(titles):
    def is_ip_address(user):
        # Check if a user is an IP address (IPv4 or IPv6)
        ipv4_pattern = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
        ipv6_pattern = re.compile(r'^(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}$')
        return ipv4_pattern.match(user) is not None or ipv6_pattern.match(user) is not None

    def get_revisions(page_id):
        endpoint = "https://en.wikipedia.org/w/api.php"
        revisions = []
        continue_flag = True
        rvcontinue = None

        while continue_flag:
            # Define parameters for the request (to get revisions)
            params = {
                'action': 'query',
                'format': 'json',
                'prop': 'revisions',
                'rvprop': 'timestamp|user|comment',
                'rvlimit': 'max',
                'pageids': page_id
            }
            if rvcontinue:
                params['rvcontinue'] = rvcontinue

            # Make the request to get revisions
            response = requests.get(endpoint, params=params)

            # Check if the request was successful
            if response.status_code != 200:
                raise Exception(f"Error: Unable to fetch revisions from Wikipedia API. Status code: {response.status_code}")

            # Parse the response
            data = response.json()
            page_revisions = data['query']['pages'][str(page_id)]['revisions']
            revisions.extend(page_revisions)

            # Check for continue flag for pagination
            if 'continue' in data:
                rvcontinue = data['continue']['rvcontinue']
            else:
                continue_flag = False

        return revisions

    def get_talk_page_id(title):
        endpoint = "https://en.wikipedia.org/w/api.php"
        params = {
            'action': 'query',
            'format': 'json',
            'prop': 'info',
            'titles': f"Talk:{title}"
        }

        response = requests.get(endpoint, params=params)
        if response.status_code != 200:
            raise Exception(f"Error: Unable to fetch data from Wikipedia API. Status code: {response.status_code}")

        data = response.json()
        pages = data['query']['pages']
        talk_page_id = list(pages.keys())[0]

        if talk_page_id == '-1':  # No talk page
            return None

        return talk_page_id

    # Define the endpoint
    endpoint = "https://en.wikipedia.org/w/api.php"

    # Define parameters for the first request (to get page IDs and basic info)
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'info',
        'titles': '|'.join(titles)
    }

    # Make the request to get page IDs and basic info
    response = requests.get(endpoint, params=params)

    # Check if the request was successful
    if response.status_code != 200:
        return f"Error: Unable to fetch data from Wikipedia API. Status code: {response.status_code}"

    # Parse the response
    data = response.json()
    pages = data['query']['pages']

    # Prepare a structure to hold the article information
    article_info = []

    for page_id, page in pages.items():
        if 'missing' in page:
            continue  # Skip missing pages

        title = page.get('title')
        page_id = page.get('pageid')

        # Get revisions for the main article
        revisions = get_revisions(page_id)

        # Get revisions for the talk page
        talk_page_id = get_talk_page_id(title)
        talk_revisions = get_revisions(talk_page_id) if talk_page_id else []

        # Calculate the number of edits
        num_edits = len(revisions)

        # Calculate the number of unique editors
        editors = set(rev['user'] for rev in revisions if 'user' in rev)
        num_editors = len(editors)

        # Separate registered and anonymous editors
        registered_editors = set()
        anonymous_editors = set()
        editor_contributions = {}

        for rev in revisions:
            user = rev.get('user')
            if user:
                if is_ip_address(user):
                    anonymous_editors.add(user)
                else:
                    registered_editors.add(user)
            else:
                anonymous_editors.add('anonymous')  # Fallback in case user is missing completely

            if user not in editor_contributions:
                editor_contributions[user] = 0
            editor_contributions[user] += 1

        # Calculate occasional editors (less than 3 edits)
        occasional_editors = {user for user, count in editor_contributions.items() if count < 3}

        # Count reverts
        revert_keywords = ["revert", "undid", "rv"]
        revert_count = sum(1 for rev in revisions if any(keyword in rev.get('comment', '').lower() for keyword in revert_keywords))

        # Calculate the number of discussions (edits in the talk page)
        discussion_count = len(talk_revisions)

        # Get the creation date of the article
        creation_date = datetime.strptime(revisions[-1]['timestamp'], "%Y-%m-%dT%H:%M:%SZ")

        # Calculate the age of the article
        article_age = datetime.now() - creation_date

        # Append the information to the list
        article_info.append({
            'title': title,
            'page_id': page_id,
            'article_age_days': article_age.days,
            'num_edits': num_edits,
            'num_editors': num_editors,
            'num_registered_editors': len(registered_editors),
            'num_anonymous_editors': len(anonymous_editors),
            'num_occasional_editors': len(occasional_editors),
            'revert_count': revert_count,
            'discussion_count': discussion_count
        })

    return article_info

In [14]:
# Example usage
titles = ["Blue whale"]
info = get_wikipedia_article_info(titles)
print(info)

[{'title': 'Blue whale', 'page_id': 4925, 'article_age_days': 8209, 'num_edits': 4494, 'num_editors': 1833, 'num_registered_editors': 1138, 'num_anonymous_editors': 695, 'num_occasional_editors': 1554, 'revert_count': 790, 'discussion_count': 808}]


In [16]:
review1 = []
for i in range(0, 2000, 50):
    lst = df['title'].iloc[i:i+50].tolist()
    df_add = get_wikipedia_article_info(lst)
    review1.append(df_add)

KeyboardInterrupt: 

In [None]:
len(review1)

In [None]:
review2 = []
for i in range(2000, 4000, 50):
    lst = df['title'].iloc[i:i+50].tolist()
    df_add = get_wikipedia_article_info(lst)
    review2.append(df_add)

In [None]:
len(review2)

In [None]:
review3 = []
for i in range(4000, 5200, 50):
    lst = df['title'].iloc[i:i+50].tolist()
    df_add = get_wikipedia_article_info(lst)
    review3.append(df_add)

In [None]:
len(review3)

In [None]:
df_review = pd.DataFrame(review[0])
for i in range(1, len(review)):
    df_add = pd.DataFrame(review[i])
    df_review = pd.concat([df_review, df_add])

In [None]:
len(df_review)

In [None]:
df_review

In [None]:
df_review.isnull().sum()