In [1]:
import pandas as pd

In [2]:
# load data
df = pd.read_csv("data/publications.csv")

# Fix affiliations

### add missing affiliations based on found ones

In [3]:
affiliations = df[["name", "affiliation_clean", "Fakultät", "Institut / Lehrstuhl / Fachbereich / Ptrofessur", "AG / Bereich", "Adresse", "Homepage", "Notiz"]].drop_duplicates()

In [4]:
affiliations = affiliations[~affiliations["affiliation_clean"].isna()]

In [5]:
affiliations = affiliations.drop_duplicates(subset=['name'], keep=False)

In [6]:
affiliations_dict = affiliations.to_dict("records")

In [7]:
affiliations_dict_new = {}
for i in affiliations_dict:
    if pd.isnull(i["affiliation_clean"]):
        continue
    else:
        affiliations_dict_new[i["name"]] = i

In [8]:
for index, row in df.iterrows():
    if pd.isnull(row["affiliation_clean"]):
        clean_aff = affiliations_dict_new.get(row["name"])
        if clean_aff:
            df.at[index, "affiliation_clean"] = clean_aff["affiliation_clean"]
            df.at[index, "Fakultät"] = clean_aff["Fakultät"]
            df.at[index, "Institut / Lehrstuhl / Fachbereich / Ptrofessur"] = clean_aff["Institut / Lehrstuhl / Fachbereich / Ptrofessur"]
            df.at[index, "AG / Bereich"] = clean_aff["AG / Bereich"]
            df.at[index, "Adresse"] = clean_aff["Adresse"]
            df.at[index, "Homepage"] = clean_aff["Homepage"]
            df.at[index, "Notiz"] = clean_aff["Notiz"]

In [9]:
df.groupby("name")["affiliation_clean"].nunique().sort_values(ascending=False).head(10)

name
Ingo Scholtes         2
Anna-Marie Ortloff    2
Maik Fröbe            2
Golsa Heidari         1
Hannah Bast           1
Ricardo Usbeck        1
Richard Meyes         1
Rishiraj Saha Roy     1
Hajira Jabeen         1
Hai Dang Tran         1
Name: affiliation_clean, dtype: int64

# Analysis

### Top 10

In [10]:
df["New_Group_Name"] = ""
for index, row in df.iterrows():
    if row["AG / Bereich"] == "Webis Group":
        df.at[index, "New_Group_Name"] = "| Webis Group | | | |"
    else:

        df.at[index, "New_Group_Name"] = "| "+str(row['affiliation_clean']) +" | "+ str(row['Fakultät']) +" | "+ str(row['Institut / Lehrstuhl / Fachbereich / Ptrofessur']) +" | "+ str(row['AG / Bereich']) +" |"

In [11]:
df.groupby(["New_Group_Name"])["doi"].nunique().sort_values(ascending=False).head(11)  # 11 because of the "nan" category

New_Group_Name
| nan | nan | nan | nan |                                                                                                                                                                             294
| Webis Group | | | |                                                                                                                                                                                  37
| Max Planck Institute | nan | nan | Databases and Information Systems |                                                                                                                               30
| nan | nan | nan | Forschungszentrum L3S |                                                                                                                                                            28
| nan | nan | nan | GESIS - Leibniz-Institut für Sozialwissenschaften |                                                                                                          

In [12]:
df.groupby(["New_Group_Name"])["doi"].nunique().sort_values(ascending=False).head(11).to_frame().reset_index()["New_Group_Name"].tolist()[1:]

['| Webis Group | | | |',
 '| Max Planck Institute | nan | nan | Databases and Information Systems |',
 '| nan | nan | nan | Forschungszentrum L3S |',
 '| nan | nan | nan | GESIS - Leibniz-Institut für Sozialwissenschaften |',
 '| TIB | nan | nan | Forschungsgruppe Visual Analytics |',
 '| Universität Bonn | Mathematisch-Naturwissenschaftliche Fakultät | Institut für Informatik | Data Science & Intelligent Systems (DSIS) research group |',
 '| Universität Regensburg | Fakultät für Sprach-, Literatur- und Kulturwissenschaften | Institut für Information und Medien, Sprache und Kultur (I:IMSK) | Lehrstuhl für Informationswissenschaft |',
 '| Universität Mannheim | Fakultät für Wirtschaftsinformatik und Wirtschaftsmathematik | Institut für Informatik und Wirtschaftsinformatik | Data and Web Science Group |',
 '| nan | nan | nan | Bosch Center for Artificial Intelligence |',
 '| TH Köln | Institut für Informationswissenschaft & Institut für Informationsmanagement | nan | Information Retriev

In [13]:
names = {
    '| Webis Group | | | |': "Webis Group",
    '| Max Planck Institute | nan | nan | Databases and Information Systems |': "Max Planck Institute - Databases and Information Systems",
    '| nan | nan | nan | Forschungszentrum L3S |':"Forschungszentrum L3S" ,
    '| nan | nan | nan | GESIS - Leibniz-Institut für Sozialwissenschaften |': "GESIS - Leibniz-Institut für Sozialwissenschaften",
    '| TIB | nan | nan | Forschungsgruppe Visual Analytics |': "TIB - Forschungsgruppe Visual Analytics",
    '| Universität Bonn | Mathematisch-Naturwissenschaftliche Fakultät | Institut für Informatik | Data Science & Intelligent Systems (DSIS) research group |': "Universität Bonn - Data Science & Intelligent Systems (DSIS) research group",
    '| Universität Regensburg | Fakultät für Sprach-, Literatur- und Kulturwissenschaften | Institut für Information und Medien, Sprache und Kultur (I:IMSK) | Lehrstuhl für Informationswissenschaft |': "Universität Regensburg - Lehrstuhl für Informationswissenschaft",
    '| Universität Mannheim | Fakultät für Wirtschaftsinformatik und Wirtschaftsmathematik | Institut für Informatik und Wirtschaftsinformatik | Data and Web Science Group |': "Universität Mannheim - Data and Web Science Group",
    '| nan | nan | nan | Bosch Center for Artificial Intelligence |': "Bosch Center for Artificial Intelligence",
    '| TH Köln | Institut für Informationswissenschaft & Institut für Informationsmanagement | nan | Information Retrieval Research Group |': "TH Köln - Information Retrieval Research Group"
 }

In [14]:
df["New_Group_Name"] = df["New_Group_Name"].replace(names)

In [15]:
df.groupby(["New_Group_Name"])["doi"].nunique().sort_values(ascending=False).head(11).to_frame().reset_index()[1:]

Unnamed: 0,New_Group_Name,doi
1,Webis Group,37
2,Max Planck Institute - Databases and Informati...,30
3,Forschungszentrum L3S,28
4,GESIS - Leibniz-Institut für Sozialwissenschaften,13
5,TIB - Forschungsgruppe Visual Analytics,13
6,Universität Bonn - Data Science & Intelligent ...,11
7,Universität Regensburg - Lehrstuhl für Informa...,10
8,Bosch Center for Artificial Intelligence,10
9,Universität Mannheim - Data and Web Science Group,10
10,TH Köln - Information Retrieval Research Group,9


In [16]:
top10_groups = names.values()

In [17]:
top10_groups

dict_values(['Webis Group', 'Max Planck Institute - Databases and Information Systems', 'Forschungszentrum L3S', 'GESIS - Leibniz-Institut für Sozialwissenschaften', 'TIB - Forschungsgruppe Visual Analytics', 'Universität Bonn - Data Science & Intelligent Systems (DSIS) research group', 'Universität Regensburg - Lehrstuhl für Informationswissenschaft', 'Universität Mannheim - Data and Web Science Group', 'Bosch Center for Artificial Intelligence', 'TH Köln - Information Retrieval Research Group'])

In [18]:
top10 = df[df["New_Group_Name"].isin(top10_groups)]

In [23]:
top10.drop_duplicates("doi")["num_authors"].describe()

count    156.000000
mean       4.833333
std        3.244516
min        1.000000
25%        3.000000
50%        4.000000
75%        6.000000
max       17.000000
Name: num_authors, dtype: float64

### # publications per group

In [34]:
def pubs_by_group(group):
    a = top10[top10["New_Group_Name"]==group].groupby("doi").count().index.tolist()
    return top10[top10["doi"].isin(a)][["title", "doi", "acronym", "year", "num_authors"]].sort_values(by="title").drop_duplicates("doi").reset_index(drop=True)

In [35]:
table = {}
for group in top10_groups:
    g = pubs_by_group(group)

    table[group] = {
        "min_authors": round(g["num_authors"].min()),
        "mean_authors": round(g["num_authors"].mean()),
        "max_authors": round(g["num_authors"].max()),
        "Publications": round(g.count()["title"]),
        "SIGIR": round(g[g["acronym"] == "SIGIR"].count()["title"]),
        "CIKM": round(g[g["acronym"] == "CIKM"].count()["title"]),
        "WWW": round(g[g["acronym"] == "WWW"].count()["title"]),
        "ECIR": round(g[g["acronym"] == "ECIR"].count()["title"]),
        "CHIIR": round(g[g["acronym"] == "CHIIR"].count()["title"]),
        "CLEF": round(g[g["acronym"] == "CLEF"].count()["title"]),
    }

In [36]:
pd.DataFrame(table).T

Unnamed: 0,min_authors,mean_authors,max_authors,Publications,SIGIR,CIKM,WWW,ECIR,CHIIR,CLEF
Webis Group,3,8,17,37,6,6,0,13,5,7
Max Planck Institute - Databases and Information Systems,1,3,6,30,13,7,4,5,1,0
Forschungszentrum L3S,1,4,12,28,3,10,14,0,1,0
GESIS - Leibniz-Institut für Sozialwissenschaften,3,5,12,13,1,4,3,0,5,0
TIB - Forschungsgruppe Visual Analytics,3,5,12,13,3,3,3,2,2,0
Universität Bonn - Data Science & Intelligent Systems (DSIS) research group,2,5,12,11,1,6,3,0,1,0
Universität Regensburg - Lehrstuhl für Informationswissenschaft,1,3,6,10,0,0,0,3,7,0
Universität Mannheim - Data and Web Science Group,2,3,6,10,0,4,5,0,1,0
Bosch Center for Artificial Intelligence,3,6,10,10,1,5,4,0,0,0
TH Köln - Information Retrieval Research Group,2,4,7,9,2,0,0,3,0,4


### Top Tf-IDF topics

In [125]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer
from nltk.corpus import stopwords
import nltk

In [137]:
stemmer = SnowballStemmer('english')
# Initialize Porter stemmer
stemmer = PorterStemmer()

def custom_tokenizer(text):
    words = word_tokenize(text)

    # Stem words
    # stemmed_words = [stemmer.stem(word) for word in words]
    
    # Remove stopwords
    # filtered_words = [word for word in stemmed_words if word not in stopwords.words('english')]
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    
    # Create bi-grams
    bi_grams = [' '.join(bi) for bi in zip(filtered_words, filtered_words[1:])]
    
    return filtered_words + bi_grams

vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

In [138]:
def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [139]:
top10["title_clean"] = top10["title"].apply(remove_punctuation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10["title_clean"] = top10["title"].apply(remove_punctuation)


In [140]:
docs = top10.groupby("New_Group_Name").agg({"title_clean": " ".join})

In [141]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

In [142]:
tfidf_matrix = vectorizer.fit_transform(docs["title_clean"].tolist())



In [143]:
vectorizer.get_feature_names_out()

array(['19', '19 years', '1st', ..., 'years', 'years answering',
       'years conversational'], dtype=object)

In [144]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [145]:
docs.reset_index(inplace=True)
docs = pd.concat([docs, tfidf_df], axis=1)
docs.set_index("New_Group_Name", inplace=True)
docs = docs.drop("title_clean", axis=1)

In [146]:
top_100 = {}
for c in docs.T.columns:
    top = docs.T[c].sort_values(ascending=False)[:100].index.tolist()
    top = [x for x in top if not x.isnumeric()]
    top_100[c] = top[:20]

In [151]:
pd.DataFrame(top_100).head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Bosch Center for Artificial Intelligence,welding,machine,machine learning,knowledge,knowledge graph,graph,executable knowledge,executable,enhancing knowledge,enhancing
Forschungszentrum L3S,neural,using,forward,ranking using,model,indexes,using forward,forward indexes,neural ranking,efficient neural
GESIS - Leibniz-Institut für Sozialwissenschaften,language queries,knowledge,knowledge base,base,natural language,natural,language,tweetscov19,covid19,queries
Max Planck Institute - Databases and Information Systems,answering,conversational question,question answering,question,knowledge,conversational,explanations,facts,quantity facts,quantity
TH Köln - Information Retrieval Research Group,experiments,ir experiments,ir,living,systemoriented,lilas,reproducibility,systemoriented ir,schema ir,academic search
TIB - Forschungsgruppe Visual Analytics,multimodal,geolocation,search,news,web,international workshop,international,workshop,image,layouts
Universität Bonn - Data Science & Intelligent Systems (DSIS) research group,knowledge,knowledge graph,graph,international workshop,international,openstreetmap,worldscale geographic,worldscale,worldkg worldscale,worldkg
Universität Mannheim - Data and Web Science Group,matching,detection,using,sentiment stance,learning product,using sentiment,recommender systems,news recommender,bias news,systems using
Universität Regensburg - Lehrstuhl für Informationswissenschaft,snippets,featured snippets,featured,influence,investigating influence,attitudes,privacy,user attitudes,snippets user,influence featured
Webis Group,overview,argument,touché,overview touché,retrieval,argument retrieval,questions,retrieval overview,information retrieval,pan
