In [285]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
french_stopwords = list(fr_stop)
from scipy import stats
import math
import string

In [358]:
def Tokenizer(randomstring,french_stopwords=french_stopwords):
    translator = str.maketrans('', '', string.punctuation)
    randomstring = randomstring.translate(translator)
    randomstring = randomstring.lower()
    randomstring = " ".join(randomstring.split())
    words = randomstring.split(" ")
    words = [item for item in words if item not in french_stopwords]
    words = [item for item in words if len(item)>2]
    return words

In [359]:
def SentenceCompteur(compteur,words,weight=1):
    for w in words:
        compteur[w] = compteur.get(w,0) + weight
    return compteur

def BooleanCorpusCompteur(Corpus):
    compteur = {}
    for doc in Corpus:
        compteur = SentenceCompteur(compteur,doc)
    return compteur

In [360]:
def RemoveTopBottomFrequentWords(Compteur,RemoveTopPct,RemoveBottomPct):

    CompteurList = list(Compteur.items())
    CompteurList.sort(key=lambda a:a[1],reverse = True)

    NTop = round(RemoveTopPct * len(CompteurList))
    NBottom = round(RemoveBottomPct * len(CompteurList))

    StartingIndex = NTop
    EndingIndex = len(CompteurList)-1-NBottom

    CompteurList = [item for iditem,item in enumerate(CompteurList) if iditem>=StartingIndex and iditem<=EndingIndex]
    res = {}
    for item in CompteurList:
        res[item[0]] = item[1]
    
    return res

In [361]:
def BuildDocumentsTouched(V,CleanedCorpus):
    
    DocumentsTouched = {}
    for w in V:
        for d in CleanedCorpus:
            if w in d:
                DocumentsTouched[w] = DocumentsTouched.get(w,0) + 1        
    
    return DocumentsTouched

In [362]:
def BuildIDF(V,CleanedCorpus,DocsTouched):
    
    L = []
    for idw,w in enumerate(V):
        for idd,d in enumerate(CleanedCorpus):
            if w in d:
                idf = math.log(D/DocsTouched[w])
                obj = {"DocumentID" : idd, "Word" : w , "TFIDF" : idf}
                L.append(obj)
    
    return L

In [363]:
def ConvertIDFToDataFrame(idfdic):
    df = pd.DataFrame(idfdic)
    df = df.sort_values(by=["DocumentID","TFIDF"],ascending = False).reset_index(drop=True)
    df = df.groupby('DocumentID').cumcount().to_frame(name = "rank").merge(df,left_index=True,right_index=True)
    df["rank"] = df["rank"] + 1
    return df

In [292]:
def ComputeTFIDF(Corpus,RemoveTopPct,RemoveBottomPct):

    # Cleaning
    CleanedCorpus = [Tokenizer(doc) for doc in Corpus]

    # Nombre de documents
    D = len(CleanedCorpus)

    # Creation du Compteur
    Compteur = BooleanCorpusCompteur(CleanedCorpus)

    # Remove too mich frequent and infrequent words
    Compteur = RemoveTopBottomFrequentWords(Compteur,RemoveTopPct,RemoveBottomPct)

    # Création du  Vocabulaire
    V = list(Compteur.keys())

    # Creation du documents touched
    DocsTouched = BuildDocumentsTouched(V,CleanedCorpus)

    # Creation idf
    idfdic = BuildIDF(V,CleanedCorpus,DocsTouched)

    # idf to dataframe
    TFIDFdf = ConvertIDFToDataFrame(idfdic)

    return TFIDFdf

In [495]:
def UpdateCompteurDic(OriginalDic,AddDic):
    for k,v in AddDic.items():
        OriginalDic[k] = OriginalDic.get(k,0) + v
    return OriginalDic

# test

In [366]:
doca = "je vais GRau R tennis ce  matin a 14h"
docb = "je vais à la canGine à midi oRHTu à la cantine oui à 14h"
docc = "tu vas au foot ou tVDFennis ou au tennis ce soir, on se voit à la cantine vers 14h"
ListOfDocuments = [doca,docb,docc]

RemoveTopPct = 0.1
RemoveBottomPct = 0.1

tfidfdf = ComputeTFIDF(Corpus,RemoveTopPct,RemoveBottomPct)

TypeError: 'type' object is not iterable

In [294]:
tfidfdf

Unnamed: 0,rank,DocumentID,Word,TFIDF
0,1,2,foot,1.098612
1,2,2,tvdfennis,1.098612
2,3,2,soir,1.098612
3,4,2,tennis,0.405465
4,5,2,cantine,0.405465
5,1,1,cangine,1.098612
6,2,1,midi,1.098612
7,3,1,orhtu,1.098612
8,4,1,oui,1.098612
9,5,1,cantine,0.405465


In [582]:
doca = "je vais GRau R tennis ce  matin a 14h"
docb = "je vais à la canGine à midi oRHTu à la cantine oui à 14h"
docc = "tu vas au foot ou tVDFennis ou au tennis ce soir, on se voit à la cantine vers 14h"
ListOfDocuments = [doca,docb,docc]
ListOfIDs = [55,2,94]
TweetsDataFrame = pd.DataFrame((ListOfIDs,ListOfDocuments)).T.rename(columns={0:"TWEETID",1:"TWEETCONTENT"})
TweetsDataFrame

Unnamed: 0,TWEETID,TWEETCONTENT
0,55,je vais GRau R tennis ce matin a 14h
1,2,je vais à la canGine à midi oRHTu à la cantine...
2,94,tu vas au foot ou tVDFennis ou au tennis ce so...


In [583]:
df = pd.concat((TweetsDataFrame,TweetsDataFrame),axis = 0)

In [585]:
df.groupby("TWEETCONTENT")["TWEETID"].sum().reset_index()

Unnamed: 0,TWEETCONTENT,TWEETID
0,je vais GRau R tennis ce matin a 14h,110
1,je vais à la canGine à midi oRHTu à la cantine...,4
2,tu vas au foot ou tVDFennis ou au tennis ce so...,188


In [419]:
def DeplyrDF(TweetsDataFrame):

    L = []
    for i,row in TweetsDataFrame.iterrows():
        tweetid = row["TWEETID"]
        tweetcontent = row["TWEETCONTENT"]
        tweetcontent = pd.Series(tweetcontent)
        tempdf = tweetcontent.to_frame(name = "word")
        tempdf["tweetid"] = tweetid
        L.append(tempdf)

    res = pd.concat(L,axis=0)
    res.reset_index(drop=True,inplace=True)

    return res

In [791]:
class Corpus():
    
    # Initialisation de l'objet
    def __init__(self):
        self.WORDS2TWEETS = pd.DataFrame()
        self.DOCSTOUCHED = pd.DataFrame()
        self.COMPTEUR = {}
        self.BATCH_WORDS2TWEETS = pd.DataFrame()
        self.BATCH_DOCSTOUCHED = pd.DataFrame()
        self.BATCH_COMPTEUR = {}
        self.BATCH_V = []
        self.DocsRepresentation = pd.DataFrame()
    
    # Evaluation du nombre de documents que contient le corpus
    def EvaluateD(self):
        if len(self.WORDS2TWEETS)>0:
            res = len(np.unique(self.WORDS2TWEETS.tweetid))
        else:
            res = 0
        return res
    
    # Build 3 intermediate variables before DocsRepresentation
    def AddCorpus(self,TweetsDataFrame,rank=2):
        
        tweetsdf = TweetsDataFrame.copy()
        if len(self.WORDS2TWEETS)>0:
            a = self.WORDS2TWEETS.tweetid.unique()
            b = tweetsdf.TWEETID
            fil = ~b.isin(a)
            tweetsdf = tweetsdf[fil]
            
            
        if len(tweetsdf)>0:
            tweetsdf.TWEETCONTENT = tweetsdf.TWEETCONTENT.map(lambda a : Tokenizer(a))
            Words2TweetsDF = DeplyrDF(tweetsdf)
            CompteurDic = BooleanCorpusCompteur(tweetsdf.TWEETCONTENT.tolist())
            V = list(CompteurDic.keys())
            DocsTouched = BuildDocumentsTouched(V,tweetsdf.TWEETCONTENT.tolist())
            DocsTouched = pd.DataFrame(data=DocsTouched,index=[0]).T.reset_index().rename(columns = {"index":"word",0:"f"})

            self.BATCH_WORDS2TWEETS = Words2TweetsDF
            self.BATCH_DOCSTOUCHED = DocsTouched
            self.BATCH_COMPTEUR = CompteurDic
            self.BATCH_V = V
        else:
            self.BATCH_WORDS2TWEETS = pd.DataFrame()
            self.BATCH_DOCSTOUCHED = pd.DataFrame()
            self.BATCH_COMPTEUR = {}
            self.BATCH_V = []
            
        
        # Updating part
        self.COMPTEUR = UpdateCompteurDic(self.COMPTEUR,self.BATCH_COMPTEUR)
        self.WORDS2TWEETS = pd.concat((self.WORDS2TWEETS,self.BATCH_WORDS2TWEETS),axis=0)
        if len(self.DOCSTOUCHED)>0:
            fil = self.DOCSTOUCHED.word.isin(pd.Series(V))
            nepastoucher = self.DOCSTOUCHED[~fil]
            amodifier = self.DOCSTOUCHED[fil]
            amodifier = pd.concat((amodifier,self.BATCH_DOCSTOUCHED),axis = 0)
            amodifier = amodifier.groupby("word")["f"].sum().reset_index()
            self.DOCSTOUCHED = pd.concat((amodifier,nepastoucher),axis = 0)
        else:
            self.DOCSTOUCHED = self.BATCH_DOCSTOUCHED
        
        
        return None
    
    
    def SaveOnDisk(self):
        PickleDump("tfidf_compteur.pkl",self.COMPTEUR)
        PickleDump("tfidf_docstouched.pkl",self.DOCSTOUCHED)
        PickleDump("tfidf_words2tweets.pkl",self.WORDS2TWEETS)
        PickleDump("tfidf_DocsRepresentation.pkl",self.D)
        return None
    
    def LoadData(self):
        file_exist = False
        if file_exist:
            self.DOCSTOUCHED = PickleLoad("tfidf_docstouched.pkl")
            self.WORDS2TWEETS = PickleLoad("tfidf_words2tweets.pkl")
            self.COMPTEUR = PickleLoad("tfidf_compteur.pkl")
            self.DocsRepresentation = PickleLoad("tfidf_DocsRepresentation.pkl")
        else:
            print("No data saved")
            
            
    
            
            
    def BuildDocsRepresentation(self):
        
        if len(self.BATCH_V)==0:
            return None
        
        if len(self.DocsRepresentation)>0:
            fil = self.DocsRepresentation.word.isin(pd.Series(self.BATCH_V))
            DocsRepresentationKeep = self.DocsRepresentation.copy()[~fil]
            DocsRepresentationKeep["D"] = self.EvaluateD()
        else:
            DocsRepresentationKeep = pd.DataFrame()
        #print(DocsRepresentationKeep.shape)
            
            
        fil = self.DOCSTOUCHED.word.isin(pd.Series(self.BATCH_V))
        temptouched = self.DOCSTOUCHED.copy()[fil]
        #print(temptouched.shape)
        
        
        fil = self.WORDS2TWEETS.word.isin(pd.Series(self.BATCH_V))
        tempwords2 = self.WORDS2TWEETS[fil]
        #print(tempwords2.shape)
        
        
        
        toadd = temptouched.merge(tempwords2,on="word")
        toadd["D"] = self.EvaluateD()
        #print(toadd.shape)
        #print(toadd)
        
        
        solution = pd.concat((toadd,DocsRepresentationKeep),axis = 0, sort = True)
        solution["idf"] = (solution.D / solution.f).map(lambda a : math.log(a))
        solution = solution
        solution.reset_index(drop = True,inplace = True)
        solution.sort_values(by = ["tweetid","idf"],ascending=False,inplace=True)
        solution['Rank'] = solution.groupby(["tweetid"]).cumcount()+1
        self.DocsRepresentation = solution
        
        
        #print(self.DocsRepresentation.shape)
        
        
        return None
        
        
        
        
        
        
        

In [792]:
corpus = Corpus()

In [793]:
corpus.LoadData()

No data saved


In [794]:
doca = "je vais GRau R tennis ce  matin a 14h"
docb = "je vais à la canGine à midi oRHTu à la cantine oui à 14h"
docc = "tu vas au foot ou tVDFennis ou au tennis ce soir, on se voit à la cantine vers 14h"
ListOfDocuments = [doca,docb,docc]
ListOfIDs = [55,2,94]
TweetsDataFrame = pd.DataFrame((ListOfIDs,ListOfDocuments)).T.rename(columns={0:"TWEETID",1:"TWEETCONTENT"})
TweetsDataFrame

Unnamed: 0,TWEETID,TWEETCONTENT
0,55,je vais GRau R tennis ce matin a 14h
1,2,je vais à la canGine à midi oRHTu à la cantine...
2,94,tu vas au foot ou tVDFennis ou au tennis ce so...


In [795]:
corpus.AddCorpus(TweetsDataFrame)

In [796]:
corpus.BuildDocsRepresentation()

In [None]:
corpus.SaveOnDisk()

In [798]:
corpus.DocsRepresentation

Unnamed: 0,D,f,tweetid,word,idf,Rank
13,3,1,94,foot,1.098612,1
14,3,1,94,tvdfennis,1.098612,2
15,3,1,94,soir,1.098612,3
16,3,1,94,voit,1.098612,4
2,3,2,94,tennis,0.405465,5
11,3,2,94,cantine,0.405465,6
6,3,3,94,14h,0.0,7
0,3,1,55,grau,1.098612,1
3,3,1,55,matin,1.098612,2
1,3,2,55,tennis,0.405465,3


In [799]:
doca = "les usa et la chine en guerre"
docb = "les carottes sont cuites à midi"
docc = "bientot les elections, apres le tennis"
ListOfDocuments = [doca,docb,docc]
ListOfIDs = [105,22,94]
TweetsDataFrame = pd.DataFrame((ListOfIDs,ListOfDocuments)).T.rename(columns={0:"TWEETID",1:"TWEETCONTENT"})
TweetsDataFrame

Unnamed: 0,TWEETID,TWEETCONTENT
0,105,les usa et la chine en guerre
1,22,les carottes sont cuites à midi
2,94,"bientot les elections, apres le tennis"


In [800]:
corpus.AddCorpus(TweetsDataFrame)

In [801]:
corpus.BuildDocsRepresentation()

In [None]:
corpus.SaveOnDisk()

In [802]:
x = corpus.DocsRepresentation

In [803]:
x

Unnamed: 0,D,Rank,f,idf,tweetid,word
1,5,1,1,1.609438,105,chine
3,5,2,1,1.609438,105,guerre
6,5,3,1,1.609438,105,usa
7,5,1,1,1.609438,94,foot
8,5,2,1,1.609438,94,tvdfennis
9,5,3,1,1.609438,94,soir
10,5,4,1,1.609438,94,voit
11,5,5,2,0.916291,94,tennis
12,5,6,2,0.916291,94,cantine
13,5,7,3,0.510826,94,14h
