In [1]:
import pandas as pd
import numpy as np
import csv 
import matplotlib.pyplot as plt
import spacy
import math
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
import re
nltk.download('punkt')

path = '/Users/alexandrequeant/Desktop/Statapp/Corp_HouseOfCommons_V2_2010.csv'

pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexandrequeant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
# **Fonctions de traitement**
def read_input(path_1, encod, **kwargs):
    dtype_values = kwargs.get('dtype_values', None)
    df = pd.read_csv(path, sep=';', encoding=encod, dtype=dtype_values)
    return df
dtypes = {
    'party.facts.id' : str,
    'date': object,
    'agenda': object,
    'speechnumber': int,
    'speaker': object,
    'party': object,
    'party.facts.id': object,
    'chair': bool,
    'terms': int,
    'text': object,
}

def read_HouseOfCommons(keep_date):
    '''
    Objectif:
    Cette fonction permet de lire la base des parlementaires 
    elle renvoie le dataFrame preprocessé
    Arguments:
    keep_date -> bool qui détermine si on supprime la colonne keep_date
    '''
    df = read_input(path, encod='ISO-8859-1', dtype_values=dtypes)
    if keep_date:
        df.drop(columns=['Unnamed: 0', 'iso3country', 'parliament', 'party.facts.id', 'speechnumber', 'chair', 'terms'], inplace=True)
    else:
        df.drop(columns=['Unnamed: 0', 'iso3country', 'parliament', 'party.facts.id', 'speechnumber', 'chair', 'terms', 'date'], inplace=True)
    df.rename(columns=
        {'speaker': 'Speaker'},
        inplace=True
    )
    return df 

def keep_rd_lines(df, n):
    '''
    Objectif:
    Cette fonction renvoie n random lines du df  
    Arguments:
    df -> le DataFrame
    n -> le nb de lignes qu'on souhaite garder
    '''
    df = df.sample(frac=1).reset_index(drop=True)
    df = df.head(n)
    return df

def keep_parties(df, list_of_parties):
    '''
    Objectif:
    Cette fonction le df en ne gardant que les partis politiques choisis  
    Arguments:
    df -> DataFrame : le DataFrame avec les speechs
    list_of_parties -> list : liste des partis politiques qu'on souhaite garder
    '''
    return df.loc[df['party'].isin(list_of_parties)]
import spacy
import string
from spacy.lang.en import English


putback = ['prime', 'officials', 'security', 'news', 'working', 'games', 'jobs', 'campaign', 'services',
'civil', 'economic', 'information', 'political', 'election', 'court', 'office', 'vote', 'trump', 'control', 'job', 'price',
'donald trump', 'chinese', 'problems', 'concerns', 'minister', 'nation', 'policy', 'data', 'indian', 'congress',
'president', 'network', 'american', 'accused', 'government', 'money', 'investigation', 'facebook', 
'success', 'prices', 'twitter', 'book', 'politics',  'justice', 'claims', 'russia', 'law', 'technology',
'content', 'union', 'european', 'workers']

def construct_list_stopwords(list_putback_words=putback):
  '''
  Objectif:
  On importe la liste des stopwords :
  On a donc un dataframe des stopwords. On en tire une liste simple des stopwords, où l'on remet cependant des stopwords jugés significatifs, comme expliqué pour le traitement des journaux :
  
  Arguments:
  list_putback_words -> list des mot jugés finalement signifcatifs 
  '''
  df_stopwords = pd.read_csv('/Users/alexandrequeant/Desktop/Statapp/sw1k.csv',
                names=['word', 'frequency', 'presence', 'doc_size_sum', 'type'],
                encoding='latin-1').drop(index=0)
  stopwords = df_stopwords['word'].unique()
  stopwords = list(set(stopwords) - set(list_putback_words))
  return stopwords

nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes(["tagger", "parser"])
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))

def extract_bigrams(n_grams):
    bigrams = []
    for i in range(len(n_grams)-1):
        bigram = f"{n_grams[i]} {n_grams[i+1]}"
        bigrams.append(bigram)
    return bigrams

def clean(text, list_stopwords, gram):
  '''
  Objectif:
  On va cleen le text
  On utilise la librairie SpaCy comme pour le traitement des journaux, et on enlève les nombres et la ponctuation avec la méthode translate. 
  
  Arguments:
  text -> str: le texte sur lequel on fait le processing
  list_stopwords -> list: mots à écarter !
  '''
  text = str(text).lower()
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = text.translate(str.maketrans('', '', string.digits)) 
  # Tokenisation
  tokens = word_tokenize(text)
  # Enlever les caractères qui ne sont pas des lettres
  tokens = [re.sub('[^a-zA-Z]', '', token) for token in tokens]
  # Stemming
  stemmer = PorterStemmer()
  tokens_stemmed = [stemmer.stem(token) for token in tokens]
  filtered_words = [word for word in tokens_stemmed if not word.lower() in list(english_stopwords )+ list(stop_words)]
  # Transformation en bigramme
  if gram == 'bigram':
    filtered_words = extract_bigrams(filtered_words)
  return filtered_words

technology=['technology','innovat','computer','high tech|high-tech','science','engineering']
consumer_protection=['privacy','data leak','leak','fake news',' safety','decept','defective','hack']
firms=['google','alphabet','apple','facebook','meta','amazon','microsoft']
products=['chrome','incognito','youtube','nexus','pixel','google drive','gmail','glass','street view','buzz','fitbit',
 'maps', 'doodle','play','translate','search', 'google news','nest hub','xl','nest','chromecast','stadia','hub',
 'marshmallow','lollipop','cloud','waymo','earth','engine',

'apple pay','apple watch','iphone','ipad','ipod','iwatch','macbook','macbook pro', 'macbook air','mac',
 'imac','airpods','ios','siri','icloud','apple tv','apple music','app store', 'safari','x','app','apple store',
 'xr', 'xs', 'se','iphones','itunes','ibook','plus','pro','max','mini','os','airtag','airtags','arcade','homepod',
'keynote','ipados','id','foxconn','facetime','beat','stalk',

'messenger','instagram', 'whatsapp','page','feed','oculus',

'prime', 'kindle',
'publishing','amazon prime','amazon drive','amazon video','amazon business','amazon web service',
'amazon cloud', 'alexa','echo dot','echo','dot', 'delivery', 'amazon uk','unlimited', 'episode','foods','grocery', 
'grand tour','grand','tour','viking', 'vikings','argo','argos','macmillan', 'dvd','clarkson','lord','ring','hair','skin','vacuum',
'pre','beer','drake','spark','kart','dog','twitch','cat','xo','matthew','stafford','ratchet','clank',
'swagway','album','mouse','showbiz','beauty','guardian','batman','arkham','gc','hair','skin','shirt',
'lovefilm','mirzapur','cast','audio','drama','movie','jack ryan','actor','character','lucifer','outlander',
'premier','super mario','sky','channel','voyage',


'windows','window','xp','surface','xbox','studio','microsoft office', 'office','word','cortana', 'surface pro','teams',  'playstation',
'microsoft edge', 'edge', 'gear','outlook','halo','skype','kinect','internet explorer','explorer','ie','bing','xcloud','hololens',
'forza','ori','scarlett','scorpio','wordperfect','valhalla','onedrive','games gold','lumia','azure',
'assassin creed','assassin','creed','minecraft','yammer','warcraft','tay']



ceos=['sundar','pichai','eric','schmidt','steve jobs','tim cook','mark zuckerberg','andy jassy','jeff','bezos','satya', 'nadella','bill gates',
      'gates','steve job','steve','tim', 'cook','zuckerberg','ceo','tim cook ','steve ballmer','ballmer','elop',
      'schiller','fadell','phil spencer','spencer','mcspirit','sandberg','paul','allen','larry hryb','hryb']
      

types=['tablet','mobile', 'laptop', 'pc', 'computer', 'desktop','smartphone', 'smartwatch', 'search engine', 'software','hardware',
               'machine', 'browser','ebook', 'book',  'reader',  'console', 'headphone', 'earbud','bud','store','music',
              'gaming', 'operating','streaming','title','chatbot']



competitor=['samsung', 'galaxy',  'twitter','tiktok', 'switch','sony', 'asos', 'activision blizzard', 'activision','blizzard',
            'nintendo','snes', 'netflix','android','yahoo','nokia','huawei','motorola','htc','blackberry','oppo','oneplus','rim','symbian','bbc','morrison','spotify'] 


configue=['device','feature','battery','screen','sound','gb','g','k','mm','chip','processor','design','display','touch','ram',
          'inch','keyboard','camera','handset','speaker','button','touchscreen','storage', 'data']


celebrity=['dubost','neymar','amanda','beyonce','blur','richard','hammond','ranj','jeremy clarkson', 'jeremy','momoa',
           'jared','aniston','smith','kim','tony','tom','sophie','oasis','trio','sharon','betty','raoul','moat','lauren','andrew',
           'samuel gibbs','samuel','gibbs','van','gaal']

topics = celebrity + configue + competitor + types + ceos + products + firms + consumer_protection + technology

def process_list_BigTech_words(topics):
  '''
  Objectifs:
  Obtient une liste cleen des topics BigTech traités
  Arguments:
  topics -> list : liste des topics de la bigTech 
  '''
  string_of_topics = ' '.join(topics)
  stemmer = SnowballStemmer(language='english')
  string_of_topics = stemmer.stem(string_of_topics)
  list_stem_topics = string_of_topics.split(' ')
  return list_stem_topics

def lines_to_keep(titre, liste_big_tech):
  '''
  Objectifs:
  prend en input le titre du speech
  retourne un booleen qui indique si le speech est en lien avec le domaine de la big tech
  Arguments:
  titre -> list : titre du speech étudié
  liste_big_tech -> set : set des mots en lien avec la BigTech
  '''
  if len(set(titre) & liste_big_tech) > 0:
      return True
  return False

def keep_Bigtech_speeches(df, list_stem_topics):
  '''
  Objectifs:
  prend en input le DataFrame des inputs 
  renvoie le df contenant uniquement les lignes contenant des speechs en lien avec la BigTech
  Arguments:
  df -> DataFrame : Dataframe des inputs
  list_stem_topics -> list : liste des mots en lien avec la BigTech
  '''
  set_stem_topics = set(list_stem_topics)
  df['lines_to_keep'] = df['agenda'].apply(lines_to_keep, args=(set_stem_topics,))
  df = df.loc[df['lines_to_keep']]
  df.drop(columns=['agenda', 'lines_to_keep'], inplace=True)
  return df 

def count_freqs(df, party):
  '''
  Objectifs:
  prend en input le DataFrame
  renvoie le df des fréquences pour un parti donné
  Arguments:
  df -> DataFrame : Dataframe 
  party -> str : le parti politique pour lequel on souhaite les freqs 
  '''  
  aux = df[['party', 'text']] 
  list_of_words = pd.Series(aux.groupby(by=['party']).sum().loc[party, 'text'])
  freq_df = pd.DataFrame(list_of_words.value_counts()) #, columns=[f'freq_{party}'])
  freq_df = freq_df.reset_index().rename(columns={'index':"words"})
  return freq_df

def merge_freq(df_1, df_2):
  '''
  Objectifs:
  permet de faire le merge entre les freq_df des différents partis
  Arguments:
  df_1 -> DataFrame : Dataframe 
  df_2 -> DataFrame : Dataframe 
  '''  
  df_freqs = pd.merge(
    df_1,
    df_2,
    how='outer',
    on=['words'],
  )
  return df_freqs


def count_liste(list, mot):
    '''
    Objectifs:
    calcule la fréquence de mot dans la list
    Arguments:
    list -> list : la liste qu'on étudie 
    mot : le mot dont on veut calculer la fréquence
    '''
    if type(list) == int : 
      return 0
    return list.count(mot)

def selected_words(df_freqs):
    return df_freqs['words'].unique()

def construct_df_reg(df, df_freqs, list_of_words):
    '''
    Objectifs:
    Construire le df qui va nous aider à faire nos régressions 
    Arguments:
    df_freqs -> DataFrame : le df des freqs
    '''
    for word in list_of_words:
        df[f'{word}'] = df['text'].apply(count_liste, args=(word,))
    return df 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexandrequeant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# On construit la liste des mots jugés non significatifs
stop_words = construct_list_stopwords()

In [38]:
df_Guardian = pd.read_csv('/Users/alexandrequeant/Desktop/Statapp/newSplit_topics_bigfive_theGuardian_with_sentibert_post2010.csv')
df_DE = pd.read_csv('/Users/alexandrequeant/Desktop/Statapp/new_bigfive_DailyExpress_with_sentibert_post2010.csv')

In [39]:
def treat_1(df, party):
  df = df[['author', 'fulltext']]
  if party == 'Guardian' :
    df['party'] = 'Lab'
  if party == 'DE' :
    df['party'] = 'Con'
  df.rename(columns=
          {'author' : 'Speaker', 'fulltext':'text'}, inplace=True)
  return df

def treat_2(df_1,df_2):
  df = pd.concat([df_1, df_2])
  df.dropna().reset_index(drop=True)
  return df

def treat_3(df):
  df['text'] = df['text'].apply(clean, args=(stop_words,'bigram'))
  return df 

In [40]:
df_Guardian = treat_1(df_Guardian, 'Guardian')
df_DE = treat_1(df_DE, 'DE')
df_newspaper = treat_2(df_Guardian, df_DE)
df_newspaper = df_newspaper.sample(frac=1).reset_index(drop=True)
df_newspaper = df_newspaper.head(100)
df_newspaper = treat_3(df_newspaper)
df_newspaper_treated = df_newspaper
df_newspaper_treated = df_newspaper_treated.groupby(by=['Speaker', 'party']).sum().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['party'] = 'Lab'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['party'] = 'Con'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [41]:
len(df_newspaper_treated)

55

In [42]:
df_newspaper_treated

Unnamed: 0,Speaker,party,text
0,By Aaron Brown,Con,"[googl initi, initi remind, remind favourit, f..."
1,By Alex Davies,Con,"[fan grand, grand tour, tour wait, wait episod..."
2,By Amani Hughes,Con,"[chemist celebr, celebr hi, hi th, th birthday..."
3,By Carly Read,Con,"[wa facebook, facebook appear, appear cross, c..."
4,By Charlie Moloney,Con,"[despit coverag, coverag brexit, brexit publis..."
5,By Colin Bateman in Perth,Con,"[con reveal, reveal tech, tech guru, guru stev..."
6,By David Dawkins,Con,"[facebook actual, actual finish, finish green,..."
7,By David Snelling,Con,"[appl , iphon, iphon x, x arriv, arriv store,..."
8,By Dion Dassanayake,Con,"[updat googl, googl spokesperson, spokesperson..."
9,By Emma Nolan,Con,"[ha arriv, arriv droughtland, droughtland much..."


In [43]:
aux =  df_newspaper_treated[['party', 'text']]

In [44]:
party = 'Lab'

list_of_words = pd.Series(aux.groupby(by=['party']).sum().loc['Lab', 'text'])
freq_df = pd.DataFrame(list_of_words.value_counts(), columns=[f'freq_{party}']) #c'est donc cette ligne qui ne marche pas 
list_of_words.value_counts() #ça marche
pd.DataFrame(list_of_words.value_counts())

Unnamed: 0,count
,36
compani,21
facebook,16
app store,15
fake news,13
...,...
legitimaci alibaba,1
alibaba aliyun,1
aliyun exploit,1
exploit natur,1


In [45]:
df_freqs_Con = count_freqs(df_newspaper_treated, 'Con')
df_freqs_Lab = count_freqs(df_newspaper_treated, 'Lab')

In [47]:
df_freqs = merge_freq(df_freqs_Con_work, df_freqs_Lab_work)

NameError: name 'df_freqs_Con_work' is not defined