In [1]:
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec

## Partie I :

### 1) 

In [2]:
def preprocessing(doc) :
    doc = doc.lower()
    doc = re.sub(r'[^\w\s]', '', doc)
    words = doc.split()
    return words

In [3]:
reviews = [
    "Spider man est un film incroyable !",
    "Spider man est le meilleur film !",
    "Spiderman est TELLEMENT génial."
]

In [4]:
# reviews = "Spider man est un film incroyable ! Spider man est le meilleur film ! Spiderman est TELLEMENT génial."
# print("Vocab: ", set(preprocessing(reviews)))

In [4]:
combined_vocab = set()
for review in reviews:
    combined_vocab.update(preprocessing(review))
print("Vocab: ", combined_vocab)

Vocab:  {'man', 'un', 'tellement', 'film', 'génial', 'meilleur', 'le', 'est', 'spiderman', 'spider', 'incroyable'}


### 2) 

In [5]:
def calculBOW(doc):
    words = preprocessing(doc)
    bow = Counter(words)
    return bow

### 3) 

In [6]:
for doc in reviews :
    print(calculBOW(doc))

Counter({'spider': 1, 'man': 1, 'est': 1, 'un': 1, 'film': 1, 'incroyable': 1})
Counter({'spider': 1, 'man': 1, 'est': 1, 'le': 1, 'meilleur': 1, 'film': 1})
Counter({'spiderman': 1, 'est': 1, 'tellement': 1, 'génial': 1})


### 4)

In [7]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)

In [8]:
print(vectorizer.get_feature_names_out())

['est' 'film' 'génial' 'incroyable' 'le' 'man' 'meilleur' 'spider'
 'spiderman' 'tellement' 'un']


### 5)

In [9]:
count_array = X.toarray()
df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names_out())
df

Unnamed: 0,est,film,génial,incroyable,le,man,meilleur,spider,spiderman,tellement,un
0,1,1,0,1,0,1,0,1,0,0,1
1,1,1,0,0,1,1,1,1,0,0,0
2,1,0,1,0,0,0,0,0,1,1,0


### 6)

In [10]:
vectorizer_bi = CountVectorizer(ngram_range=(1, 2))
X_bi = vectorizer_bi.fit_transform(reviews)
feature_names_bi = vectorizer_bi.get_feature_names_out()

df = pd.DataFrame(data=X_bi.toarray(), columns=feature_names_bi)
df

Unnamed: 0,est,est le,est tellement,est un,film,film incroyable,génial,incroyable,le,le meilleur,...,meilleur,meilleur film,spider,spider man,spiderman,spiderman est,tellement,tellement génial,un,un film
0,1,0,0,1,1,1,0,1,0,0,...,0,0,1,1,0,0,0,0,1,1
1,1,1,0,0,1,0,0,0,1,1,...,1,1,1,1,0,0,0,0,0,0
2,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,1,1,1,1,0,0


## Partie II :

### 1)

In [17]:
with open('alice.txt', 'r', encoding='utf-8') as file:
    alice_text = file.read()

In [18]:
alice_text = alice_text.replace('\n',' ')
alice_text = alice_text.lower()

In [19]:
alice_text[:200]

'the project gutenberg ebook of alice’s adventures in wonderland, by lewis carroll  this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and wi'

### 2)

In [20]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
sentences = sent_tokenize(alice_text)
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

In [22]:
len(tokenized_sentences)

1091

In [23]:
for i in range(5):
    print(f"Phrase {i + 1}: {tokenized_sentences[i]}")

Phrase 1: ['the', 'project', 'gutenberg', 'ebook', 'of', 'alice', '’', 's', 'adventures', 'in', 'wonderland', ',', 'by', 'lewis', 'carroll', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.']
Phrase 2: ['you', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www.gutenberg.org', '.']
Phrase 3: ['if', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', ',', 'you', 'will', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before', 'using', 'this', 'ebook', '.']
Phrase 4: ['title', ':', 'alice', '’', 's', 'adventures', 'in', 'wonderland', 'author', ':', 'lewis', 'carroll', 'release', 'dat

### 3)


In [17]:
# tokenized_words = word_tokenize(alice_text)

In [25]:
common_params = {
    'min_count': 2,
    'vector_size': 100,
    'window': 5
}

In [26]:
cbow_model = Word2Vec(sentences=tokenized_sentences, sg=0, **common_params)

skipgram_model = Word2Vec(sentences=tokenized_sentences, sg=1, **common_params)

cbow_model.save('cbow_model.bin')
skipgram_model.save('skipgram_model.bin')

### 4)

In [27]:
word1 = 'better'
word2 = 'worse'
word3 = 'sun'

In [28]:
cbow_model.wv.similarity(word1, word2)

0.9834384

In [29]:
cbow_model.wv.similarity(word1, word3)

0.87874806

In [30]:
skipgram_model.wv.similarity(word1, word2)

0.96743923

In [31]:
skipgram_model.wv.similarity(word1, word3)

0.95921224

### 5)

In [32]:
chosen_word = 'dog'

In [33]:
cbow_model.wv.most_similar(chosen_word)

[('ever', 0.9796494245529175),
 ('trying', 0.97946697473526),
 ('_i_', 0.9791896343231201),
 ('i', 0.979085385799408),
 ('near', 0.9790197014808655),
 ('mouse', 0.9788962006568909),
 ('you', 0.9788196086883545),
 ('caterpillar', 0.9787501692771912),
 ('ve', 0.9787386655807495),
 ('united', 0.9787285923957825)]

In [34]:
skipgram_model.wv.most_similar(chosen_word)

[('trial', 0.9979236125946045),
 ('changed', 0.9979097247123718),
 ('used', 0.9978994727134705),
 ('lessons', 0.9978406429290771),
 ('nobody', 0.9978319406509399),
 ('fancy', 0.9977988004684448),
 ('always', 0.9977502226829529),
 ('asking', 0.9977481961250305),
 ('pleased', 0.9976670742034912),
 ('song', 0.9976446032524109)]