In [87]:
!pip install pyspark
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 5.2 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=c5ed370e8c95d7aa9ad3401f06a411dae8c165e1296d296ccc6db4f47201eef8
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [2]:

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf,lower
from pyspark.sql.types import StringType,IntegerType,FloatType,LongType
from pyspark.sql.types import StructType

spark = SparkSession.builder\
        .master("local")\
        .appName("RECOMMENDATION SYSTEM")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()


spark

In [3]:

schema = StructType() \
      .add("Course_ID",IntegerType(),True) \
      .add("Course Title",StringType(),True) \
      .add("Course Organisation",StringType(),True) \
      .add("Course Certification Type",StringType(),True) \
      .add("Rating",FloatType(),True) \
      .add("course_difficulty",StringType(),True) \
      .add("Student Enrolled",StringType(),True) 
      
df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/content/coursea_data.csv")


In [55]:
df_with_schema.show()

+---------+--------------------+--------------------+-------------------------+------+-----------------+----------------+
|Course_ID|        Course Title| Course Organisation|Course Certification Type|Rating|course_difficulty|Student Enrolled|
+---------+--------------------+--------------------+-------------------------+------+-----------------+----------------+
|      134|(ISC)² Systems Se...|              (ISC)²|           SPECIALIZATION|   4.7|         Beginner|            5.3k|
|      743|A Crash Course in...|University of Pen...|                   COURSE|   4.7|     Intermediate|             17k|
|      874|A Crash Course in...|Johns Hopkins Uni...|                   COURSE|   4.5|            Mixed|            130k|
|      413|A Law Student's T...|     Yale University|                   COURSE|   4.7|            Mixed|             91k|
|      635|A Life of Happine...|Indian School of ...|                   COURSE|   4.8|            Mixed|            320k|
|      661|ADHD: Everyda

In [4]:
#removed punctuations
from pyspark.sql.functions import udf

import string
regular_punct = list(string.punctuation)
extra_punct = [
    ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤','⌨','☝']

all_punct = list(set(regular_punct + extra_punct))

def remove_punctuation(text):
    for punc in all_punct:
        if punc in text:
            text = text.replace(punc, ' ')
    return (text.strip().lower())
    

DF_string_remover = udf(lambda m: remove_punctuation(m))

df_with_schema=df_with_schema.withColumn("Course Title",DF_string_remover("Course Title"))
df_with_schema=df_with_schema.withColumn("Course Organisation",DF_string_remover("Course Organisation"))
df_with_schema.show(5)

+---------+--------------------+--------------------+-------------------------+------+-----------------+----------------+
|Course_ID|        Course Title| Course Organisation|Course Certification Type|Rating|course_difficulty|Student Enrolled|
+---------+--------------------+--------------------+-------------------------+------+-----------------+----------------+
|      134|isc   systems sec...|                 isc|           SPECIALIZATION|   4.7|         Beginner|            5.3k|
|      743|a crash course in...|university of pen...|                   COURSE|   4.7|     Intermediate|             17k|
|      874|a crash course in...|johns hopkins uni...|                   COURSE|   4.5|            Mixed|            130k|
|      413|a law student s t...|     yale university|                   COURSE|   4.7|            Mixed|             91k|
|      635|a life of happine...|indian school of ...|                   COURSE|   4.8|            Mixed|            320k|
+---------+-------------

In [None]:
#languae check 
#spelling corector


In [5]:
#tokenise
from pyspark.ml.feature import Tokenizer

tokenizer1 = Tokenizer(inputCol="Course Title", outputCol="Course Title_tokenise")
df_Title_tokenise= tokenizer1.transform(df_with_schema).select("Course Title_tokenise")

tokenizer = Tokenizer(inputCol="Course Organisation", outputCol="Course Organisation_tokenise")
df_Organisation_tokenise= tokenizer.transform(df_with_schema).select("Course Organisation_tokenise")


In [6]:
df_Organisation_tokenise.show()
df_Title_tokenise.show()

+----------------------------+
|Course Organisation_tokenise|
+----------------------------+
|                       [isc]|
|        [university, of, ...|
|        [johns, hopkins, ...|
|          [yale, university]|
|        [indian, school, ...|
|        [university, at, ...|
|          [deeplearning, ai]|
|          [deeplearning, ai]|
|                       [ibm]|
|          [deeplearning, ai]|
|          [deeplearning, ai]|
|          [deeplearning, ai]|
|        [amazon, web, ser...|
|        [amazon, web, ser...|
|        [amazon, web, ser...|
|        [amazon, web, ser...|
|        [amazon, web, ser...|
|        [university, of, ...|
|        [university, of, ...|
|        [university, of, ...|
+----------------------------+
only showing top 20 rows

+---------------------+
|Course Title_tokenise|
+---------------------+
| [isc, , , systems...|
| [a, crash, course...|
| [a, crash, course...|
| [a, law, student,...|
| [a, life, of, hap...|
| [adhd, , everyday...|
|  [ai, for, e

In [7]:
#stopwords
from pyspark.ml.feature import StopWordsRemover


remover1 = StopWordsRemover()
stopwords = remover1.getStopWords()
remover1.setInputCol("Course Organisation_tokenise")
remover1.setOutputCol("Course Organisation_tokenise_no_stopw")
Course_Organisation_no_stopw_df = remover1.transform(df_Organisation_tokenise).select("Course Organisation_tokenise_no_stopw")



remover2 = StopWordsRemover()
stopwords = remover2.getStopWords()
remover2.setInputCol("Course Title_tokenise")
remover2.setOutputCol("Course Title_tokenise_no_stopw")
Course_Title_no_stopw_df = remover2.transform(df_Title_tokenise).select("Course Title_tokenise_no_stopw")


In [8]:
#Course_Organisation_no_stopw_df.show()
Course_Title_no_stopw_df.show()

+------------------------------+
|Course Title_tokenise_no_stopw|
+------------------------------+
|          [isc, , , systems...|
|          [crash, course, c...|
|          [crash, course, d...|
|          [law, student, to...|
|          [life, happiness,...|
|          [adhd, , everyday...|
|                [ai, everyone]|
|          [ai, medical, tre...|
|          [ai, foundations,...|
|          [ai, medical, dia...|
|          [ai, medical, pro...|
|                [ai, medicine]|
|           [aws, fundamentals]|
|          [aws, fundamental...|
|          [aws, fundamental...|
|          [aws, fundamental...|
|          [aws, fundamental...|
|          [aboriginal, worl...|
|          [academic, englis...|
|          [accelerated, com...|
+------------------------------+
only showing top 20 rows



In [None]:
!pip install fasttext



In [9]:
#https://analyticsindiamag.com/hands-on-guide-to-word-embeddings-using-glove/
import os
import urllib.request
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
import numpy as np

In [62]:
urllib.request.urlretrieve('https://nlp.stanford.edu/data/glove.6B.zip','glove.6B.zip')
!unzip "/content/glove.6B.zip"

KeyboardInterrupt: ignored

In [10]:
emmbed_dict = {}
with open('/content/glove.6B.50d.txt','r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    emmbed_dict[word]=vector


   

In [11]:
list12=[]
list1=emmbed_dict["hi"]
for i in list1:
  list12.append(i)
list2=emmbed_dict["bro"]
for i in list2:
  list12.append(i)

list12

[-0.54313,
 0.34427,
 0.27125,
 1.0487,
 -1.1642,
 -1.2722,
 0.35781,
 -0.56527,
 -0.29879,
 0.85179,
 0.52222,
 -0.0019718,
 -0.46435,
 0.033631,
 0.048367,
 0.78762,
 0.075995,
 0.51577,
 0.34778,
 0.53802,
 0.28299,
 -0.1313,
 -0.073753,
 0.42614,
 0.030954,
 -0.55033,
 -0.99789,
 -0.28947,
 0.30517,
 -1.1194,
 1.2957,
 0.91165,
 0.32222,
 0.93405,
 -0.34152,
 -0.62713,
 -0.092165,
 0.50901,
 0.29204,
 -0.20122,
 0.19614,
 -0.45882,
 1.1099,
 -0.68737,
 1.5724,
 -0.10446,
 0.23594,
 -0.56594,
 0.43676,
 0.98093,
 -0.38585,
 0.15491,
 -0.35731,
 -0.62233,
 -0.50594,
 -0.12473,
 0.44711,
 -0.29007,
 -0.51312,
 -0.34471,
 -0.18583,
 0.86388,
 -0.4041,
 -0.74427,
 -0.33632,
 0.2364,
 -0.12313,
 0.28241,
 0.62132,
 0.29675,
 -0.46514,
 -0.38196,
 0.39958,
 0.39733,
 0.41027,
 0.15278,
 -0.5617,
 0.029929,
 -0.32652,
 -0.90463,
 -0.32284,
 0.68215,
 -0.1259,
 0.65031,
 -0.5831,
 -0.11666,
 0.79877,
 -0.58943,
 1.0938,
 0.62151,
 0.33655,
 -0.2608,
 -1.1699,
 -0.048766,
 0.75291,
 0.36884,

In [12]:
def find_similar_word(emmbedes):
  nearest = sorted(emmbed_dict.keys(), key=lambda word: spatial.distance.euclidean(emmbed_dict[word], emmbedes))
  return nearest

In [13]:

find_similar_word(emmbed_dict["isc"])[0:10]

['isc', 'iasc', 'acca', 'asc', 'sra', 'clc', 'gtc', 'amba', 'isb', 'tcc']

In [14]:
#check for emply spaces
@udf(returnType=FloatType())
def word_vec(text):

    with open('/content/glove.6B.50d.txt','r') as f:
        for line in f:
          values = line.split()
          word = values[0]
          vector = np.asarray(values[1:],'float32')
          emmbed_dict[word]=vector
          word2vec_list=[]
          for words in text:
            if len(words)>1:
               vec=emmbed_dict[text]
               for i in vec:
                 word2vec_list.append(i)  
          return(word2vec_list)  

            

In [15]:

# Below are quick examples
# Using df.to_numpy() method.


# Convert specific column to numpy array.
pandasDF_title = Course_Title_no_stopw_df.toPandas()
print(pandasDF_title)




                        Course Title_tokenise_no_stopw
0    [isc, , , systems, security, certified, practi...
1    [crash, course, causality, , , inferring, caus...
2                       [crash, course, data, science]
3                              [law, student, toolkit]
4                       [life, happiness, fulfillment]
..                                                 ...
840             [write, professional, emails, english]
841                              [write, first, novel]
842                                [writing, sciences]
843  [e, learning, ecologies, , innovative, approac...
844                     [ios, app, development, swift]

[845 rows x 1 columns]


In [92]:
!pip install googletrans

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.2 MB/s 
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 10.1 MB/s 
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.1 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.3 MB/s 
[?25hCollecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 3.9 MB/s 
[?25hCollecting hyperframe<6,>=5.2.0
  Downloading hyperframe-5.2.0-py2

In [16]:
from googletrans import Translator
import time
for title in pandasDF_title["Course Title_tokenise_no_stopw"]:
  print("Title::::",title)
  
  vect_word=[]
  for words in title:
    if len(words) >1:
        print("Words:::::",words)
        time.sleep(1)
         
        try:
            vec=emmbed_dict[words]
            print("Word to vector:::",vec)
            
            for i in vec:
                vect_word.append(i)

        except:
          print("Adding zeros........")
          
          for i in range(0,50):
              vect_word.append(0)
   
                    
            
        
             
  






[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.3505     0.47577  ]
Words::::: lifecycle
Word to vector::: [ 1.4304   -0.1765   -0.4981    0.28046  -0.50782   0.22503   0.21176
 -1.3831    1.5038    0.42261   0.6498    0.26753  -0.38346  -0.64164
  0.33974  -0.17874  -0.12311   0.5156    1.0581   -0.47244   0.069606
 -0.59469  -0.8349    0.062369 -0.31252   1.7006    0.53305  -0.20277
 -0.11023   0.78985   1.0232   -0.091621 -0.49417  -0.31405  -0.48516
  0.84373  -0.19373   0.24859   0.67772  -0.59678   0.54902  -1.5021
 -0.87419   0.74147  -0.64884  -0.47666   1.4977    0.30523   0.12266
  0.30392 ]
Title:::: ['software', 'processes', 'agile', 'practices']
Words::::: software
Word to vector::: [ 0.40093  -1.1683    1.2586    1.3243   -0.60614  -0.36914  -1.2668
 -1.7872    0.67804   1.1791    0.72633   0.7432   -0.32346  -0.31177
 -0.025236  0.072382 -0.84757   0.70602   0.88824  -0.67657   0.54001
 -0.11735  -0.90053   0.4227    0.21735  -0.5767   -0.40095  -1.1

In [21]:
import numpy as np
np_vect_word=np.asarray(vect_word)
np_vect_word.shape

(200,)

In [118]:
try:
    a=emmbed_dict.get["xqxx"]
    print("found")
except:
  print(":not found")    

:not found


In [68]:
#Course_Organisation_no_stopw_df.show()
Course_Title_no_stopw_df.columns

['Course Title_tokenise_no_stopw']

In [75]:
numpy_course_name=Course_Title_no_stopw_df['Course Title_tokenise_no_stopw'].to_numpy()

TypeError: ignored

In [70]:
Course_Title_no_stopw_df.select("Title_word2vec").show()

PythonException: ignored

In [None]:
@udf(returnType=FloatType())
def names_without_spaces(text):
    for words in text:
        if len(words) >1:
          return(emmbed_dict[text])   

            else:
              return()

In [None]:
word_vec("cool")

TypeError: ignored

In [80]:

pandasDF = Course_Title_no_stopw_df.toPandas()
print(pandasDF)


                        Course Title_tokenise_no_stopw
0    [isc, , , systems, security, certified, practi...
1    [crash, course, causality, , , inferring, caus...
2                       [crash, course, data, science]
3                              [law, student, toolkit]
4                       [life, happiness, fulfillment]
..                                                 ...
886                     [программирование, на, python]
887            [психолингвистика, , psycholinguistics]
888  [разработка, интерфейсов, , вёрстка, и, javasc...
889                        [русский, как, иностранный]
890  [финансовые, инструменты, для, частного, инвес...

[891 rows x 1 columns]


In [24]:
print(pandasDF["Course Organisation_tokenise_no_stopw"].values[0] )


['isc']


In [26]:

DF_word_numeric = udf(lambda m: word_vec(m))
df_org_vector=Course_Organisation_no_stopw_df.withColumn("Course Organisation_numeric",DF_word_numeric ("Course Organisation_tokenise_no_stopw"))
df_org_vector.show()



PythonException: ignored

In [25]:
df_title_vector=Course_Title_no_stopw_df.withColumn("Course Title_numeric",DF_word_numeric ("Course Title_tokenise_no_stopw"))
df_title_vector.show(5)




NameError: ignored

In [None]:
df_with_schema=df_with_schema.withColumn("Course Organisation_numeric}",DF_word_numeric ("Course Organisation"))
df_with_schema.show(5)

In [None]:
word_vec("cool")[0:50]

['text',
 'reference',
 'translation',
 'document',
 'copy',
 'texts',
 'read',
 'translated',
 'addresses',
 'letters',
 'references',
 'translations',
 'page',
 'description',
 'printed',
 'written',
 'notes',
 'note',
 'words',
 'word',
 'reads',
 'describing',
 'refers',
 'explaining',
 'instructions',
 'context',
 'publish',
 'phrase',
 'follows',
 'phrases',
 'message',
 'mentioned',
 'preface',
 'letter',
 'introduction',
 'instance',
 'stating',
 'uses',
 'mentions',
 'passages',
 'contents',
 'descriptions',
 'articles',
 'edit',
 'referred',
 'quotations',
 'article',
 'example',
 'explicit',
 'interpreted']

In [None]:
Course_Organisation_no_stopw_df.show()

+-------------------------------------+
|Course Organisation_tokenise_no_stopw|
+-------------------------------------+
|                                [isc]|
|                 [university, penn...|
|                 [johns, hopkins, ...|
|                   [yale, university]|
|                 [indian, school, ...|
|                 [university, buff...|
|                   [deeplearning, ai]|
|                   [deeplearning, ai]|
|                                [ibm]|
|                   [deeplearning, ai]|
|                   [deeplearning, ai]|
|                   [deeplearning, ai]|
|                 [amazon, web, ser...|
|                 [amazon, web, ser...|
|                 [amazon, web, ser...|
|                 [amazon, web, ser...|
|                 [amazon, web, ser...|
|                 [university, toro...|
|                 [university, cali...|
|                 [university, illi...|
+-------------------------------------+
only showing top 20 rows



In [None]:
joined_title_1=[]
for i,title in enumerate(numpy_organisation):
  
  for j in range(len(title[0])):
      joined="".join(str(title[0][j]))
  joined_title_1.append(joined)  


Tit_jponed=" ".join(joined_comments)
comments_jponed   

In [None]:
str_organisation=[]
for i in numpy_organisation:
  text_org=[]
  text=i[0]
  
  for j in range(len(text)):
    #print(j)
    text_org.append(str(i[0][j]))
  #print(text_org)
  str_organisation.append(text_org)  

str_organisation

[['isc'],
 ['univers', 'pennsylvania'],
 ['john', 'hopkin', 'univers'],
 ['yale', 'univers'],
 ['indian', 'school', 'busi'],
 ['univers', 'buffalo'],
 ['deeplearn'],
 ['deeplearn'],
 ['ibm'],
 ['deeplearn'],
 ['deeplearn'],
 ['deeplearn'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['univers', 'toronto'],
 ['univers', 'california', 'irvin'],
 ['univers', 'illinoi', 'urbana', 'champaign'],
 ['isc'],
 ['univers', 'pennsylvania'],
 ['univers', 'michigan'],
 ['univers', 'pennsylvania'],
 ['universidad', 'nacion', 'autónoma', 'xico'],
 ['yale', 'univers'],
 ['univers', 'colorado', 'boulder'],
 ['ibm'],
 ['nation', 'research', 'univers', 'higher', 'school', 'econom'],
 ['googl', 'cloud'],
 ['erasmu', 'univers', 'rotterdam'],
 ['duke', 'univers'],
 ['yale', 'univers'],
 ['univers', 'virginia'],
 ['univers', 'virginia'],
 ['atlassian'],
 ['scrumtrek'],
 ['univers', 'california', 'san', '

In [None]:
removed_organisation_indices=[]
str_organisation_sanitised=[]
for i,content in enumerate(str_organisation):
  if len(content)!=0:
    str_organisation_sanitised.append(content)
  else:
    removed_organisation_indices.append(i)


In [None]:
docs_organisation=" ".join(joined_organisation)
docs_organisation

'isc pennsylvania univers univers busi buffalo deeplearn deeplearn ibm deeplearn deeplearn deeplearn servic servic servic servic servic toronto irvin champaign isc pennsylvania michigan pennsylvania xico univers boulder ibm econom cloud rotterdam univers univers virginia virginia atlassian scrumtrek diego univers system univers pennsylvania school irvin univers michigan univers michigan pennsylvania univers edinburgh univers xico dtu ibm ibm michigan michigan michigan austral xico diego cloud cloud cloud univers art art strategi bocconi virginia edinburgh chile davi autodesk autodesk autodesk autodesk univers amsterdam cruz cruz share univers system univers town univers michigan barcelona univers diego univers boulder buffalo insead insead academi pari school school irvin alberta jerusalem cloud cloud cloud cloud pari pennsylvania univers washington washington pennsylvania pari univers pennsylvania virginia busi cloud boulder pennsylvania cruz system univers osmosi univers chile michig

In [None]:
def count_frequency(comments): 
  wordlist = comments.split() 
  wordfreq = []
  for w in wordlist:
      wordfreq.append(wordlist.count(w))

  print("String\n" + docs_organisation +"\n")
  #print("List\n" + str(wordlist) + "\n")
  #print("Frequencies\n" + str(wordfreq) + "\n")
  #print("Pairs\n" + str(list(zip(wordlist, wordfreq))))

  print(len(wordfreq))
  print(len(wordlist))

  Word_frequency={}

  for key,value in zip(wordlist,wordfreq):
      Word_frequency[key] = value

  return (Word_frequency)


In [None]:
frequency_comments=count_frequency(docs_organisation)

String
isc pennsylvania univers univers busi buffalo deeplearn deeplearn ibm deeplearn deeplearn deeplearn servic servic servic servic servic toronto irvin champaign isc pennsylvania michigan pennsylvania xico univers boulder ibm econom cloud rotterdam univers univers virginia virginia atlassian scrumtrek diego univers system univers pennsylvania school irvin univers michigan univers michigan pennsylvania univers edinburgh univers xico dtu ibm ibm michigan michigan michigan austral xico diego cloud cloud cloud univers art art strategi bocconi virginia edinburgh chile davi autodesk autodesk autodesk autodesk univers amsterdam cruz cruz share univers system univers town univers michigan barcelona univers diego univers boulder buffalo insead insead academi pari school school irvin alberta jerusalem cloud cloud cloud cloud pari pennsylvania univers washington washington pennsylvania pari univers pennsylvania virginia busi cloud boulder pennsylvania cruz system univers osmosi univers chile 

In [None]:
removed_organisation_indices=[]
str_organisation_sanitised=[]
for i,content in enumerate(str_organisation):
  if len(content)!=0:
    str_organisation_sanitised.append(content)
  else:
    removed_organisation_indices.append(i)
str_organisation_sanitised

[['isc'],
 ['univers', 'pennsylvania'],
 ['john', 'hopkin', 'univers'],
 ['yale', 'univers'],
 ['indian', 'school', 'busi'],
 ['univers', 'buffalo'],
 ['deeplearn'],
 ['deeplearn'],
 ['ibm'],
 ['deeplearn'],
 ['deeplearn'],
 ['deeplearn'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['amazon', 'web', 'servic'],
 ['univers', 'toronto'],
 ['univers', 'california', 'irvin'],
 ['univers', 'illinoi', 'urbana', 'champaign'],
 ['isc'],
 ['univers', 'pennsylvania'],
 ['univers', 'michigan'],
 ['univers', 'pennsylvania'],
 ['universidad', 'nacion', 'autónoma', 'xico'],
 ['yale', 'univers'],
 ['univers', 'colorado', 'boulder'],
 ['ibm'],
 ['nation', 'research', 'univers', 'higher', 'school', 'econom'],
 ['googl', 'cloud'],
 ['erasmu', 'univers', 'rotterdam'],
 ['duke', 'univers'],
 ['yale', 'univers'],
 ['univers', 'virginia'],
 ['univers', 'virginia'],
 ['atlassian'],
 ['scrumtrek'],
 ['univers', 'california', 'san', '

In [None]:
doc_orgs=[]
def convert_stng(data):
  text=""
  for i in data:
      text=text+" "+str(i)
  return(text.strip())
for i in str_organisation_sanitised:
  doc_orgs.append(convert_stng(i))
doc_orgs

['isc',
 'univers pennsylvania',
 'john hopkin univers',
 'yale univers',
 'indian school busi',
 'univers buffalo',
 'deeplearn',
 'deeplearn',
 'ibm',
 'deeplearn',
 'deeplearn',
 'deeplearn',
 'amazon web servic',
 'amazon web servic',
 'amazon web servic',
 'amazon web servic',
 'amazon web servic',
 'univers toronto',
 'univers california irvin',
 'univers illinoi urbana champaign',
 'isc',
 'univers pennsylvania',
 'univers michigan',
 'univers pennsylvania',
 'universidad nacion autónoma xico',
 'yale univers',
 'univers colorado boulder',
 'ibm',
 'nation research univers higher school econom',
 'googl cloud',
 'erasmu univers rotterdam',
 'duke univers',
 'yale univers',
 'univers virginia',
 'univers virginia',
 'atlassian',
 'scrumtrek',
 'univers california san diego',
 'stanford univers',
 'univers colorado system',
 'yale univers',
 'univers pennsylvania',
 'copenhagen busi school',
 'univers california irvin',
 'macquari univers',
 'univers michigan',
 'yale univers',
 '

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer2 = CountVectorizer()
bog_vector= vectorizer2.fit_transform(doc_orgs)
vectorizer2.get_feature_names_out()

array(['academi', 'administração', 'alberta', 'alto', 'amazon',
       'american', 'amsterdam', 'and', 'anywher', 'architectur',
       'arizona', 'art', 'atlassian', 'austral', 'autodesk', 'autom',
       'autònoma', 'autónoma', 'aviv', 'barcelona', 'bcg', 'berkle',
       'birkbeck', 'bloomsburi', 'bocconi', 'boulder', 'buffalo', 'busi',
       'california', 'campina', 'cape', 'carolina', 'case', 'católica',
       'champaign', 'chang', 'chapel', 'chicago', 'chile', 'cisco',
       'cloud', 'cloudera', 'cole', 'colleg', 'colorado', 'columbia',
       'con', 'consensi', 'copenhagen', 'crece', 'cruz', 'davi',
       'deeplearn', 'denmark', 'design', 'develop', 'diego', 'dtu',
       'duke', 'econom', 'edhec', 'edinburgh', 'educ', 'eindhoven',
       'emori', 'erasmu', 'esad', 'escp', 'essec', 'estadu', 'exchang',
       'financ', 'florida', 'forum', 'foundat', 'fund', 'fundação',
       'futur', 'geneva', 'georg', 'georgia', 'giesk', 'goldsmith',
       'googl', 'graduat', 'great', 'gr

In [None]:
Organisation_bog=bog_vector.toarray()
Organisation_bog.shape

(888, 233)

In [None]:
numpy_Title

array([[list(['isc', 'system', 'secur', 'certifi', 'practition', 'sscp'])],
       [list(['crash', 'cours', 'causal', 'infer', 'causal', 'effect', 'observ', 'data'])],
       [list(['crash', 'cours', 'data', 'scienc'])],
       [list(['law', 'student', 'toolkit'])],
       [list(['life', 'happi', 'fulfil'])],
       [list(['adhd', 'everyday', 'strategi', 'elementari', 'student'])],
       [list(['everyon'])],
       [list(['medic', 'treatment'])],
       [list(['foundat', 'everyon'])],
       [list(['medic', 'diagnosi'])],
       [list(['medic', 'prognosi'])],
       [list(['medicin'])],
       [list(['fundament'])],
       [list(['fundament', 'address', 'secur', 'risk'])],
       [list(['fundament', 'build', 'serverless', 'applic'])],
       [list(['fundament', 'cloud', 'nativ'])],
       [list(['fundament', 'migrat', 'cloud'])],
       [list(['aborigin', 'worldview', 'educ'])],
       [list(['academ', 'english', 'write'])],
       [list(['acceler', 'comput', 'scienc', 'fundament'])],

In [None]:

doc_title=[]
def convert_stng(data):
  text=""
  for i in data:
      text=text+" "+str(i)
  return(text.strip())

for i in str_organisation_sanitised:
  doc_title.append(convert_stng(i))
doc_title

['isc',
 'univers pennsylvania',
 'john hopkin univers',
 'yale univers',
 'indian school busi',
 'univers buffalo',
 'deeplearn',
 'deeplearn',
 'ibm',
 'deeplearn',
 'deeplearn',
 'deeplearn',
 'amazon web servic',
 'amazon web servic',
 'amazon web servic',
 'amazon web servic',
 'amazon web servic',
 'univers toronto',
 'univers california irvin',
 'univers illinoi urbana champaign',
 'isc',
 'univers pennsylvania',
 'univers michigan',
 'univers pennsylvania',
 'universidad nacion autónoma xico',
 'yale univers',
 'univers colorado boulder',
 'ibm',
 'nation research univers higher school econom',
 'googl cloud',
 'erasmu univers rotterdam',
 'duke univers',
 'yale univers',
 'univers virginia',
 'univers virginia',
 'atlassian',
 'scrumtrek',
 'univers california san diego',
 'stanford univers',
 'univers colorado system',
 'yale univers',
 'univers pennsylvania',
 'copenhagen busi school',
 'univers california irvin',
 'macquari univers',
 'univers michigan',
 'yale univers',
 '

In [None]:
!pip install langdetect

  



In [None]:
from langdetect import detect
def lang_process(data):
  lang_detected=detect(data)
  print(lang_detected,data)


for i in doc_title:
  lang_process(i)

de isc
no univers pennsylvania
fi john hopkin univers
tr yale univers
en indian school busi
en univers buffalo
nl deeplearn
nl deeplearn
tl ibm
nl deeplearn
nl deeplearn
nl deeplearn
en amazon web servic
en amazon web servic
en amazon web servic
en amazon web servic
en amazon web servic
en univers toronto
en univers california irvin
it univers illinoi urbana champaign
de isc
no univers pennsylvania
de univers michigan
no univers pennsylvania
es universidad nacion autónoma xico
tr yale univers
es univers colorado boulder
tl ibm
en nation research univers higher school econom
cy googl cloud
no erasmu univers rotterdam
no duke univers
tr yale univers
lt univers virginia
lt univers virginia
en atlassian
ro scrumtrek
it univers california san diego
no stanford univers
es univers colorado system
tr yale univers
no univers pennsylvania
nl copenhagen busi school
en univers california irvin
it macquari univers
de univers michigan
tr yale univers
de univers michigan
no univers pennsylvania
no va

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer3 = CountVectorizer()
bog_vector2= vectorizer3.fit_transform(doc_title)
vectorizer3.get_feature_names_out()

array(['academi', 'administração', 'alberta', 'alto', 'amazon',
       'american', 'amsterdam', 'and', 'anywher', 'architectur',
       'arizona', 'art', 'atlassian', 'austral', 'autodesk', 'autom',
       'autònoma', 'autónoma', 'aviv', 'barcelona', 'bcg', 'berkle',
       'birkbeck', 'bloomsburi', 'bocconi', 'boulder', 'buffalo', 'busi',
       'california', 'campina', 'cape', 'carolina', 'case', 'católica',
       'champaign', 'chang', 'chapel', 'chicago', 'chile', 'cisco',
       'cloud', 'cloudera', 'cole', 'colleg', 'colorado', 'columbia',
       'con', 'consensi', 'copenhagen', 'crece', 'cruz', 'davi',
       'deeplearn', 'denmark', 'design', 'develop', 'diego', 'dtu',
       'duke', 'econom', 'edhec', 'edinburgh', 'educ', 'eindhoven',
       'emori', 'erasmu', 'esad', 'escp', 'essec', 'estadu', 'exchang',
       'financ', 'florida', 'forum', 'foundat', 'fund', 'fundação',
       'futur', 'geneva', 'georg', 'georgia', 'giesk', 'goldsmith',
       'googl', 'graduat', 'great', 'gr

In [None]:
Title_bog=bog_vector2.toarray()
Title_bog.shape

(888, 233)

In [None]:
#delete the array
#title
"""
print(Title_bog.shape)
Title_bog=np.delete(Title_bog,removed_organisation_indices,axis=0)
print(Title_bog.shape)
#organisation
print(Organisation_bog.shape)
Organisation_bog=np.delete(Organisation_bog,removed_organisation_indices,axis=0)
print(Organisation_bog.shape)
"""
#Certification
print(encoded_Certification.shape)
encoded_Certification=np.delete(encoded_Certification,removed_organisation_indices,axis=0)
print(encoded_Certification.shape)


#level
print(encoded_Level.shape)
encoded_Level=np.delete(encoded_Level,removed_organisation_indices,axis=0)
print(encoded_Level.shape)
#rating
print(scaled_rating.shape)
scaled_rating=np.delete(scaled_rating,removed_organisation_indices,axis=0)
print(scaled_rating.shape)
#opted
print(scaled_opted.shape)
scaled_opted=np.delete(scaled_opted,removed_organisation_indices,axis=0)
print(scaled_opted.shape)


(891,)
(888,)
(891,)
(888,)
(891, 1)
(888, 1)
(891, 1)
(888, 1)


In [None]:
encoded_Certification=encodeing_label([ i[0][0] for i in numpy_Certificate ])
 
encoded_Level=encodeing_label([ i[0][0] for i in numpy_Level])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(scaled_opted, test_size=0.33, random_state=42)

In [None]:
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=7, min_samples=10).fit(X_train)