## Assignment 1 Kai Foerster (ID: 214288)

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# 1.1 Choose one of the sessions, and retrieve it using R or Python.

# URL of the XML data
url = "https://www.bundestag.de/resource/blob/968690/5d723616da1ea3ca054e8da604ff1004/20124-data.xml"

# Send a GET request to the server and store the response
r = requests.get(url)

soup = BeautifulSoup(r.content)



In [3]:
#1.2 Using a scraper, get a list of all the elements.

rede_elements = soup.find_all('rede')

In [4]:
# 1.3 For each element, get the name of the speaker, and a single string containing everything that they said. 
# Put this into a dataframe.

# Initialize a list to store the extracted info
data = []

for rede in rede_elements:
    # Extract the paragraph describing the speaker
    vorname_element = rede.select('vorname')
    nachname_element = rede.select('nachname')
    
    # Check if vorname and nachname elements are not empty, and extract text from them
    vorname = vorname_element[0].text if vorname_element else ""
    nachname = nachname_element[0].text if nachname_element else ""
    
    # Extract the speech paragraphs with klasse="J", "J_1", or "O"
    speech_paragraphs = rede.select('p', class_=['J', 'J_1', 'O'])
    
    # Combine all speech paragraphs into one string, skipping the first paragraph if there are more than one
    speech_text = ' '.join([p.get_text(strip=True) for p in speech_paragraphs[1:]]) 
    
    # Append the extracted information to the data list
    data.append({
        "Name": vorname + " " + nachname,
        "Speech": speech_text
    })

df = pd.DataFrame.from_dict(data)
df    

Unnamed: 0,Name,Speech
0,Christian Lindner,Frau Präsidentin! Liebe Kolleginnen und Kolleg...
1,Svenja Schulze,"Sehr geehrte Frau Präsidentin, das freut uns a..."
2,Mathias Middelberg,"Frau Präsidentin, herzlichen Dank für das Wort..."
3,Christian Lindner,"Vielen Dank, Frau Präsidentin. – Lieber Herr K..."
4,Mathias Middelberg,Auch wir finden den angebotsorientierten Ansat...
...,...,...
159,Manfred Todtenhausen,Frau Präsidentin! Liebe Kolleginnen! Liebe Kol...
160,Martina Stamm-Fibich,Sehr geehrte Frau Präsidentin! Liebe Kolleginn...
161,Dirk Brandes,"Vielen Dank, Frau Präsidentin, dass Sie die Ku..."
162,Daniela Ludwig,Frau Präsidentin! Liebe Kolleginnen und Kolleg...


In [5]:
#2.1 Choose a politician, and print the number of speeches they made in this session

# Group by 'Name', count the occurrences, and reset the index
grouped_df = df.groupby('Name').size().reset_index(name='count')

# Sort the DataFrame based on the 'count' column in descending order
sorted_df = grouped_df.sort_values(by='count', ascending=False)

# Filter the rows where 'Name' is 'Christian Lindner'
filtered_df = sorted_df[sorted_df['Name'] == 'Christian Lindner']

print(filtered_df)

                 Name  count
14  Christian Lindner     27


In [6]:
# 2.2 Print the content of the first speech by the politician you choose.

# Filter rows where 'Name' is 'Svenja Schulze'
lindner_speech = df[df['Name'] == 'Christian Lindner']

# Access the first row and the second column
element = lindner_speech.iloc[0, 1]  # Remember, Python uses zero-based indexing. lindner_speech['Speech'].iloc[0] is an alternative

print(element)

Frau Präsidentin! Liebe Kolleginnen und Kollegen! Ich will mich zunächst bei der Innenministerin, Nancy Faeser, bedanken, dass sie in der vergangenen Woche kurzfristig eingesprungen ist, als ich positiv war, eine Covid-Infektion hatte. Ich will drei Punkte nennen. Erster Punkt. Die wirtschaftliche Entwicklung in unserem Land ist unbefriedigend. Hierbei sind zum einen konjunkturelle Belastungsfaktoren, zum anderen aber auch strukturelle Defizite unserer Wettbewerbsfähigkeit, die wir seit vielen Jahren kennen, zu nennen. Die Bundesregierung geht diese entschlossen an, von A wie „Arbeitskräfte“ bis P wie „Planungs- und Genehmigungsverfahren“, die wir beschleunigen wollen. In meinem Geschäftsbereich kommen zwei wichtige Gesetzgebungsvorhaben hinzu: zum einen das Wachstumschancengesetz, mit dem wir Forschungsförderung, Investitionen und Eigenkapitalbasis stärken, sowie das Zukunftsfinanzierungsgesetz, mit dem wir den Kapitalmarktzugang insbesondere für junge und innovative Unternehmen verbe

In [19]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Download NLTK data
nltk.download('stopwords')

#lindner_speeches = lindner_speech['Speech']
#all_speeches = ' '.join([p for p in lindner_speeches])
all_speeches = element

# Load German stop words
german_stop_words = list(stopwords.words('german'))

# Initialize TfidfVectorizer with German stop words and without tokenization of words with punctuation
vectorizer = TfidfVectorizer(stop_words=german_stop_words, token_pattern=r'\b\w+\b')
dfm = vectorizer.fit_transform(all_speeches)
vocab = vectorizer.get_feature_names_out()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaius\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: Iterable over raw text documents expected, string object received.

In [15]:
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import numpy as np

fig, ax = plt.subplots()

X = dfm.todense()

# Plot a heatmap of the dfm
ax.imshow(
    X,
    cmap = "Greys",
    norm = Normalize(vmin=0, vmax=3)
)

# Create a grid using minor ticks
ax.set_xticks(np.arange(X.shape[1])+0.5, minor=True)
ax.set_yticks(np.arange(X.shape[0])+0.5, minor=True)
ax.grid(which="minor", zorder=5)

# Set up x labels
ax.xaxis.tick_top()
ax.set_xticks(np.arange(X.shape[1]))
ax.set_xticklabels(vocab, rotation=60, ha="left", va="bottom")

# Set up y labels
ax.set_yticks(range(len(all_speeches)))
ax.set_yticklabels(all_speeches)


# Put the numbers in
for m in range(X.shape[0]):
    for n in range(X.shape[1]):
        ax.text(n, m, X[m, n], ha="center", va="center")

Error in callback <function flush_figures at 0x0000027C0FF45090> (for post_execute):


KeyboardInterrupt: 

In [17]:
vocab

array(['abgeordnet', 'absicht', 'act', 'adaquat', 'adjustment',
       'afghanistan', 'allerding', 'allgemein', 'amt', 'anerkannt',
       'angebotsseit', 'angeht', 'angekundigt', 'angeschaut', 'angesicht',
       'anheiz', 'anja', 'anlass', 'anpass', 'anstreng', 'antwort',
       'antwortzeit', 'appell', 'arbeit', 'arbeitseb', 'arbeitskraft',
       'arbeitsmarkt', 'arbeitsmarktintegration', 'arbeitsmarktzugang',
       'arbeitsplatz', 'armut', 'asylantrag',
       'asylbewerberleistungsgesetz', 'asylpaket', 'asylverfahr',
       'attraktiv', 'attraktivst', 'aufbau', 'auffass', 'aufnahm',
       'augenmerk', 'ausfuhr', 'ausgabeobergrenz', 'ausgabeprogramm',
       'ausgabeverhalt', 'ausgefuhrt', 'ausgelauf', 'ausgerechnet',
       'ausgezahlt', 'auss', 'aussengrenz', 'ausserhalb', 'ausserst',
       'ausstieg', 'austausch', 'austauschmechanism', 'auswart',
       'auswert', 'auswirk', 'auszahl', 'auszuzahl', 'baldmog',
       'bandbreit', 'baustein', 'beabsichtigt', 'beacht', 'beachte

In [159]:
# 2.3 Process the list of speeches into a TFIDF matrix. What are the highest scoring terms in this matrix for the
# first speech by the politician you have chosen?
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


lindner_speeches = lindner_speech['Speech']
all_speeches = ' '.join([p for p in lindner_speeches])


def porter_tokenizer(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in word_tokenize(text)]

vectorizer = CountVectorizer(stop_words= "german", tokenizer= porter_tokenizer)
dfm = vectorizer.fit_transform(all_speeches)
vocab = vectorizer.get_feature_names_out()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaius\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


InvalidParameterError: The 'stop_words' parameter of CountVectorizer must be a str among {'english'}, an instance of 'list' or None. Got 'german' instead.

In [21]:
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import GermanStemmer
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Assuming svenja_speech is your DataFrame
#speech_list = lindner_speech['Speech'].tolist()

# Load German stop words
german_stop_words = set(stopwords.words('german'))

# Initialize German stemmer
stemmer = GermanStemmer()

# Define a function for preprocessing
def preprocess(text):
    # Tokenize and remove punctuation
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    
    # Remove stop words and perform stemming
    return ' '.join(stemmer.stem(word) for word in words if word.lower() not in german_stop_words)

# Preprocess the text
processed_texts = [preprocess(text) for text in speech_list]

# Compute TF-IDF
vectorizer = TfidfVectorizer()
dfm= vectorizer.fit_transform(processed_texts)
vocab = vectorizer.get_feature_names_out()

# Extract top features based on tf-idf scores
features = vectorizer.get_feature_names_out()
sums = dfm.sum(axis=0)

# Create a dictionary with features and their corresponding tf-idf scores
data = []
for col, term in enumerate(features):
    data.append((term, sums[0, col]))

ranking = pd.DataFrame(data, columns=['term', 'rank'])
sorted_ranking = ranking.sort_values('rank', ascending=False)

# Display top 11 features
print(sorted_ranking.head(11))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaius\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaius\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: empty vocabulary; perhaps the documents only contain stop words