In [9]:
import requests
import pandas as pd
import os

def get_openalex_data(year):
    url = f"https://api.openalex.org/works?filter=from_publication_date:{year}-01-01,to_publication_date:{year}-12-31&per-page=200"
    r = requests.get(url).json()
    return r["results"]

all_data = []
for year in range(2019, 2026):
    data = get_openalex_data(year)
    for item in data:
        # Safely get 'journal' name
        journal_name = item.get("host_venue", {}).get("display_name")

        # Safely get 'field' name
        field_name = None
        concepts = item.get("concepts")
        if concepts and isinstance(concepts, list) and len(concepts) > 0:
            field_name = concepts[0].get("display_name")

        # Safely get 'open_access' status
        open_access_status = item.get("open_access", {}).get("is_oa")

        # Safely get 'abstract'
        abstract_inverted_index = item.get("abstract_inverted_index")
        abstract_text = None
        if abstract_inverted_index:
            # Reconstruct abstract from inverted index if available
            words_by_position = {}
            for word, positions in abstract_inverted_index.items():
                for pos in positions:
                    words_by_position[pos] = word
            # Sort words by their original position and join them
            abstract_text = " ".join(words_by_position[pos] for pos in sorted(words_by_position.keys()))

        all_data.append({
            "paper_id": item["id"],
            "title": item["title"],
            "year": year,
            "citations": item["cited_by_count"],
            "journal": journal_name,
            "field": field_name,
            "open_access": open_access_status,
            "abstract": abstract_text
        })

df = pd.DataFrame(all_data)

# Create the directory if it doesn't exist
os.makedirs('data_raw', exist_ok=True)

df.to_csv("data_raw/openalex_base.csv", index=False)

In [10]:
ai_keywords = [
    "chatgpt",
    "generative ai",
    "large language model",
    "llm",
    "ai-assisted",
    "gpt"
]

def label_ai(text):
    if text is None:
        return 0
    text = text.lower()
    return int(any(k in text for k in ai_keywords))

df["AI_assisted"] = df["abstract"].apply(label_ai)

In [23]:
import textstat
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

def nlp_features(text):
    if text is None:
        return {
            "flesch": None,
            "fog": None,
            "sentence_len": None,
            "lexical_diversity": None
        }
    try:
        return {
            "flesch": textstat.flesch_reading_ease(text),
            "fog": textstat.gunning_fog(text),
            "sentence_len": len(word_tokenize(text)) / max(1, len(sent_tokenize(text))),
            "lexical_diversity": len(set(word_tokenize(text))) / max(1, len(word_tokenize(text)))
        }
    except ZeroDivisionError:
        # Handle cases where sent_tokenize might return an empty list or other edge cases
        return {
            "flesch": None,
            "fog": None,
            "sentence_len": None,
            "lexical_diversity": None
        }

In [13]:
!pip install textstat nltk

Collecting textstat
  Downloading textstat-0.7.12-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.12-py3-none-any.whl (176 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.6/176.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.12


In [16]:
import statsmodels.formula.api as smf

df["Post"] = (df["year"] >= 2022).astype(int)
df["interaction"] = df["AI_assisted"] * df["Post"]

# Temporarily remove C(field) to diagnose the error
model = smf.ols(
    "citations ~ AI_assisted + Post + interaction + open_access",
    data=df
).fit(cov_type="HC3")

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              citations   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     9.147
Date:                Fri, 23 Jan 2026   Prob (F-statistic):           2.70e-07
Time:                        10:40:52   Log-Likelihood:                -15983.
No. Observations:                1400   AIC:                         3.198e+04
Df Residuals:                    1395   BIC:                         3.200e+04
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept            6489.9277    

In [17]:
df["event_time"] = df["year"] - 2022


In [18]:
model_fe = smf.ols(
    "citations ~ AI_assisted + C(paper_id) + C(year)",
    data=df
).fit()


In [24]:
df_nlp_features = df['abstract'].apply(nlp_features).apply(pd.Series)
df = pd.concat([df, df_nlp_features], axis=1)

m1 = smf.ols("flesch ~ AI_assisted", data=df).fit()
m2 = smf.ols("citations ~ flesch + AI_assisted", data=df).fit()

In [27]:
import pandas as pd

# Group by AI_assisted and calculate statistics
summary_stats = df.groupby('AI_assisted').agg(
    avg_citations=('citations', 'mean'),
    median_citations=('citations', 'median'),
    open_access_rate=('open_access', lambda x: x.sum() / len(x))
).round(2)

# Rename AI_assisted column for clarity
summary_stats.rename(index={0: 'Non-AI assisted', 1: 'AI-assisted'}, inplace=True)

print("Makale İstatistikleri (AI ve Non-AI olarak):")
print(summary_stats)

print("\nOrtalama yazar sayısı ve tekrar baskı oranı verileri mevcut değildir.")

Makale İstatistikleri (AI ve Non-AI olarak):
                 avg_citations  median_citations  open_access_rate
AI_assisted                                                       
Non-AI assisted        4594.21            2523.0              0.68
AI-assisted            2226.21            1241.0              0.79

Ortalama yazar sayısı ve tekrar baskı oranı verileri mevcut değildir.


In [26]:
# Toplam makale sayısı
N = len(df)
print(f"Toplam makale sayısı: N = {N}")

# AI-assisted makale sayısı
N_AI = df['AI_assisted'].sum()
print(f"AI-assisted makale sayısı: N_AI = {N_AI}")

# Non-AI makale sayısı
N_nonAI = N - N_AI
print(f"Non-AI makale sayısı: N_nonAI = {N_nonAI}")

# Yıl aralığı
min_year = df['year'].min()
max_year = df['year'].max()
print(f"Yıllar: {min_year}–{max_year}")

Toplam makale sayısı: N = 1400
AI-assisted makale sayısı: N_AI = 43
Non-AI makale sayısı: N_nonAI = 1357
Yıllar: 2019–2025


In [25]:
print(m1.summary())
print(m2.summary())

                            OLS Regression Results                            
Dep. Variable:                 flesch   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                  0.001902
Date:                Fri, 23 Jan 2026   Prob (F-statistic):              0.965
Time:                        10:51:41   Log-Likelihood:                -4314.3
No. Observations:                 889   AIC:                             8633.
Df Residuals:                     887   BIC:                             8642.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      17.9553      1.067     16.826      

In [21]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [29]:
# The previous code was a syntax error. If you intended to run a regression with fixed effects,
# you might need to use a format similar to this, assuming 'citations_it', 'AI_it' are column names
# and 'paper_id' and 'year' are the variables for fixed effects.
# import numpy as np
# model_fixed_effects = smf.ols("np.log(1 + citations) ~ AI_assisted + C(paper_id) + C(year)", data=df).fit()
# print(model_fixed_effects.summary())

In [30]:
import numpy as np
import statsmodels.formula.api as smf

df["log_citations"] = np.log(1 + df["citations"])

model_fe = smf.ols(
    "log_citations ~ AI_assisted + C(paper_id) + C(year)",
    data=df
).fit()

print(model_fe.summary())


  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


                            OLS Regression Results                            
Dep. Variable:          log_citations   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Fri, 23 Jan 2026   Prob (F-statistic):                nan
Time:                        11:07:18   Log-Likelihood:                 39164.
No. Observations:                1400   AIC:                        -7.553e+04
Df Residuals:                       0   BIC:                        -6.819e+04
Df Model:                        1399                                         
Covariance Type:            nonrobust                                         
                                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------

In [32]:
import numpy as np
import statsmodels.formula.api as smf

df["log_citations"] = np.log(1 + df["citations"])

# Removing C(field) as it causes issues with the F-test calculation
model_main = smf.ols(
    "log_citations ~ AI_assisted + C(year) + open_access",
    data=df
).fit(cov_type="HC3")

print(model_main.summary())

                            OLS Regression Results                            
Dep. Variable:          log_citations   R-squared:                       0.637
Model:                            OLS   Adj. R-squared:                  0.635
Method:                 Least Squares   F-statistic:                     245.0
Date:                Fri, 23 Jan 2026   Prob (F-statistic):          3.20e-259
Time:                        11:17:05   Log-Likelihood:                -1401.9
No. Observations:                1400   AIC:                             2822.
Df Residuals:                    1391   BIC:                             2869.
Df Model:                           8                                         
Covariance Type:                  HC3                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               8.4433    

In [33]:
import statsmodels.formula.api as smf

models = {
    "Flesch": "flesch ~ AI_assisted + C(year) + C(field)",
    "Fog": "fog ~ AI_assisted + C(year) + C(field)",
    "SentenceLength": "sentence_len ~ AI_assisted + C(year) + C(field)"
}

results = {}

for name, formula in models.items():
    model = smf.ols(formula, data=df).fit(cov_type="HC3")
    coef = model.params["AI_assisted"]
    se = model.bse["AI_assisted"]
    p = model.pvalues["AI_assisted"]
    results[name] = (coef, se, p)

for k, v in results.items():
    print(k, "→ β:", v[0], "SE:", v[1], "p:", v[2])


  self.het_scale = (self.wresid / (1 - h))**2
  scale[:, None] * self.model.pinv_wexog.T)
  self.het_scale = (self.wresid / (1 - h))**2
  scale[:, None] * self.model.pinv_wexog.T)


Flesch → β: 3.742681972171127 SE: inf p: 1.0
Fog → β: -0.35717864934265803 SE: inf p: 1.0
SentenceLength → β: -1.7026506380626867 SE: inf p: 1.0


  self.het_scale = (self.wresid / (1 - h))**2
  scale[:, None] * self.model.pinv_wexog.T)


In [34]:
models = {
    "Flesch": "flesch ~ AI_assisted + C(year)",
    "Fog": "fog ~ AI_assisted + C(year)",
    "SentenceLength": "sentence_len ~ AI_assisted + C(year)"
}

for name, formula in models.items():
    model = smf.ols(formula, data=df).fit()
    print("\n", name)
    print("β:", model.params["AI_assisted"])
    print("SE:", model.bse["AI_assisted"])
    print("p:", model.pvalues["AI_assisted"])



 Flesch
β: 1.920373951295227
SE: 5.024898194867221
p: 0.7024261706025542

 Fog
β: -0.5238355149012883
SE: 1.5250397936326232
p: 0.7313120149796499

 SentenceLength
β: -2.063417018900648
SE: 4.528962562205764
p: 0.6487864696831747
